Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin 2018-11-21 08:33:39 +00:00
commit 7fa7fa0226
31 changed files with 895 additions and 114 deletions

View File

@ -323,7 +323,6 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add
OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CV_GCC )
OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC )
OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF ((CV_GCC OR CV_CLANG) AND PPC64LE) )
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) )
if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS)) # Use CPU_BASELINE instead
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )

View File

@ -5,6 +5,10 @@
# AVX / AVX2 / AVX_512F
# FMA3
# ppc64le arch:
# VSX (always available on Power8)
# VSX3 (always available on Power9)
# CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag)
# CPU_{opt}_IMPLIES=<list>
# CPU_{opt}_FORCE=<list> - subset of "implies" list
@ -29,7 +33,7 @@
set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_SKX")
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@ -81,7 +85,7 @@ ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
ocv_optimization_process_obsolete_option(ENABLE_VSX VSX OFF)
ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON)
macro(ocv_is_optimization_in_list resultvar check_opt)
set(__checked "")
@ -289,14 +293,24 @@ elseif(ARM OR AARCH64)
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
elseif(PPC64LE)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3")
ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
ocv_update(CPU_VSX3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx3.cpp")
if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE)
ocv_update(CPU_VSX3_IMPLIES "VSX")
endif()
if(CV_CLANG AND (NOT ${CMAKE_CXX_COMPILER} MATCHES "xlc"))
ocv_update(CPU_VSX_FLAGS_ON "-mvsx -maltivec")
ocv_update(CPU_VSX3_FLAGS_ON "-mpower9-vector")
else()
ocv_update(CPU_VSX_FLAGS_ON "-mcpu=power8")
ocv_update(CPU_VSX3_FLAGS_ON "-mcpu=power9 -mtune=power9")
endif()
set(CPU_DISPATCH "VSX3" CACHE STRING "${HELP_CPU_DISPATCH}")
set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
# Helper values for cmake-gui

View File

@ -1,8 +1,12 @@
# if defined(__VSX__)
# include <altivec.h>
# else
# error "VSX is not supported"
# endif
#if defined(__VSX__)
#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
#include <altivec.h>
#else
#error "OpenCV only supports little-endian mode"
#endif
#else
#error "VSX is not supported"
#endif
int main()
{

17
cmake/checks/cpu_vsx3.cpp Normal file
View File

@ -0,0 +1,17 @@
#if defined(__VSX__)
#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
#include <altivec.h>
#else
#error "OpenCV only supports little-endian mode"
#endif
#else
#error "VSX3 is not supported"
#endif
int main()
{
__vector unsigned char a = vec_splats((unsigned char)1);
__vector unsigned char b = vec_splats((unsigned char)2);
__vector unsigned char r = vec_absd(a, b);
return 0;
}

View File

@ -2,7 +2,7 @@ set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
# dispatching for accuracy tests
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)

View File

@ -107,7 +107,7 @@
# include <arm_neon.h>
#endif
#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
#ifdef CV_CPU_COMPILE_VSX
# include <altivec.h>
# undef vector
# undef pixel
@ -115,6 +115,10 @@
# define CV_VSX 1
#endif
#ifdef CV_CPU_COMPILE_VSX3
# define CV_VSX3 1
#endif
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@ -237,3 +241,7 @@ struct VZeroUpperGuard {
#ifndef CV_VSX
# define CV_VSX 0
#endif
#ifndef CV_VSX3
# define CV_VSX3 0
#endif

View File

@ -315,5 +315,26 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...) CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3
# define CV_TRY_VSX3 1
# define CV_CPU_FORCE_VSX3 1
# define CV_CPU_HAS_SUPPORT_VSX3 1
# define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3
# define CV_TRY_VSX3 1
# define CV_CPU_FORCE_VSX3 0
# define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3))
# define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
# define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
#else
# define CV_TRY_VSX3 0
# define CV_CPU_FORCE_VSX3 0
# define CV_CPU_HAS_SUPPORT_VSX3 0
# define CV_CPU_CALL_VSX3(fn, args)
# define CV_CPU_CALL_VSX3_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */

View File

@ -229,6 +229,7 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
#define CV_CPU_NEON 100
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
// CPU features groups
#define CV_CPU_AVX512_SKX 256
@ -266,6 +267,7 @@ enum CpuFeatures {
CPU_NEON = 100,
CPU_VSX = 200,
CPU_VSX3 = 201,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL

View File

@ -905,6 +905,11 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
inline v_float32x8 v_not_nan(const v_float32x8& a)
{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
inline v_float64x4 v_not_nan(const v_float64x4& a)
{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
/** min/max **/
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)

View File

@ -683,6 +683,25 @@ OPENCV_HAL_IMPL_CMP_OP(==)
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(!=)
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
{
typedef typename V_TypeTraits<float>::int_type itype;
v_reg<float, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
return c;
}
template<int n>
inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
{
typedef typename V_TypeTraits<double>::int_type itype;
v_reg<double, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
return c;
}
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \

View File

@ -764,6 +764,13 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)

View File

@ -1041,6 +1041,11 @@ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)

View File

@ -607,6 +607,11 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
/** min/max **/
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)

View File

@ -1941,8 +1941,11 @@ Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Size_<_Tp>& b )
template<typename _Tp> static inline
Rect_<_Tp>& operator -= ( Rect_<_Tp>& a, const Size_<_Tp>& b )
{
a.width -= b.width;
a.height -= b.height;
const _Tp width = a.width - b.width;
const _Tp height = a.height - b.height;
CV_DbgAssert(width >= 0 && height >= 0);
a.width = width;
a.height = height;
return a;
}
@ -2007,6 +2010,15 @@ Rect_<_Tp> operator + (const Rect_<_Tp>& a, const Size_<_Tp>& b)
return Rect_<_Tp>( a.x, a.y, a.width + b.width, a.height + b.height );
}
template<typename _Tp> static inline
Rect_<_Tp> operator - (const Rect_<_Tp>& a, const Size_<_Tp>& b)
{
const _Tp width = a.width - b.width;
const _Tp height = a.height - b.height;
CV_DbgAssert(width >= 0 && height >= 0);
return Rect_<_Tp>( a.x, a.y, width, height );
}
template<typename _Tp> static inline
Rect_<_Tp> operator & (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
{

View File

@ -454,7 +454,7 @@ static inline int _initMaxThreads()
{
omp_set_dynamic(maxThreads);
}
return numThreads;
return maxThreads;
}
static int numThreadsMax = _initMaxThreads();
#elif defined HAVE_GCD

View File

@ -107,15 +107,14 @@ void* allocSingletonBuffer(size_t size) { return fastMalloc(size); }
# include <cpu-features.h>
#endif
#ifndef __VSX__
# if defined __PPC64__ && defined __linux__
#if CV_VSX && defined __linux__
# include "sys/auxv.h"
# ifndef AT_HWCAP2
# define AT_HWCAP2 26
# endif
# ifndef PPC_FEATURE2_ARCH_2_07
# define PPC_FEATURE2_ARCH_2_07 0x80000000
# endif
# ifndef PPC_FEATURE2_ARCH_3_00
# define PPC_FEATURE2_ARCH_3_00 0x00800000
# endif
#endif
@ -359,6 +358,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_NEON] = "NEON";
g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
}
@ -513,14 +513,14 @@ struct HWFeatures
#endif
#endif
#ifdef __VSX__
have[CV_CPU_VSX] = true;
#elif (defined __PPC64__ && defined __linux__)
uint64 hwcaps = getauxval(AT_HWCAP);
// there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
have[CV_CPU_VSX] = (CV_VSX);
// TODO: Check VSX3 availability in runtime for other platforms
#if CV_VSX && defined __linux__
uint64 hwcap2 = getauxval(AT_HWCAP2);
have[CV_CPU_VSX] = (hwcaps & PPC_FEATURE_PPC_LE && hwcaps & PPC_FEATURE_HAS_VSX && hwcap2 & PPC_FEATURE2_ARCH_2_07);
have[CV_CPU_VSX3] = (hwcap2 & PPC_FEATURE2_ARCH_3_00);
#else
have[CV_CPU_VSX] = false;
have[CV_CPU_VSX3] = (CV_VSX3);
#endif
int baseline_features[] = { CV_CPU_BASELINE_FEATURES };

View File

@ -972,6 +972,13 @@ bool CV_OperationsTest::operations1()
if (sz.width != 10 || sz.height != 20) throw test_excep();
if (cvSize(sz).width != 10 || cvSize(sz).height != 20) throw test_excep();
Rect r1(0, 0, 10, 20);
Size sz1(5, 10);
r1 -= sz1;
if (r1.size().width != 5 || r1.size().height != 10) throw test_excep();
Rect r2 = r1 - sz1;
if (r2.size().width != 0 || r2.size().height != 0) throw test_excep();
Vec<double, 5> v5d(1, 1, 1, 1, 1);
Vec<double, 6> v6d(1, 1, 1, 1, 1, 1);
Vec<double, 7> v7d(1, 1, 1, 1, 1, 1, 1);

View File

@ -83,7 +83,9 @@ CV__DNN_INLINE_NS_BEGIN
DNN_TARGET_OPENCL,
DNN_TARGET_OPENCL_FP16,
DNN_TARGET_MYRIAD,
DNN_TARGET_VULKAN
DNN_TARGET_VULKAN,
//! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
DNN_TARGET_FPGA
};
/** @brief This class provides all data needed to initialize layer.
@ -497,6 +499,7 @@ CV__DNN_INLINE_NS_BEGIN
* | DNN_TARGET_OPENCL | + | + | + |
* | DNN_TARGET_OPENCL_FP16 | + | + | |
* | DNN_TARGET_MYRIAD | | + | |
* | DNN_TARGET_FPGA | | + | |
*/
CV_WRAP void setPreferableTarget(int targetId);

View File

@ -6,7 +6,7 @@
#define OPENCV_DNN_VERSION_HPP
/// Use with major OpenCV version only.
#define OPENCV_DNN_API_VERSION 20180917
#define OPENCV_DNN_API_VERSION 20181121
#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS
#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)

View File

@ -42,7 +42,7 @@ public:
}
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
{
if (!checkMyriadTarget())
if (!checkIETarget(DNN_TARGET_MYRIAD))
{
throw SkipTestException("Myriad is not available/disabled in OpenCV");
}

View File

@ -1104,7 +1104,8 @@ struct Net::Impl
preferableTarget == DNN_TARGET_CPU ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16 ||
preferableTarget == DNN_TARGET_MYRIAD);
preferableTarget == DNN_TARGET_MYRIAD ||
preferableTarget == DNN_TARGET_FPGA);
CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
preferableTarget == DNN_TARGET_VULKAN);
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
@ -1609,7 +1610,9 @@ struct Net::Impl
ieNode->net = net;
auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD) && !fused)
if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
preferableTarget == DNN_TARGET_MYRIAD ||
preferableTarget == DNN_TARGET_FPGA) && !fused)
{
ieNode->layer->precision = InferenceEngine::Precision::FP16;
if (weightableLayer)

View File

@ -119,8 +119,8 @@ public:
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::SplitLayer> ieLayer(new InferenceEngine::SplitLayer(lp));
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R3)
ieLayer->params["axis"] = format("%d", input->dims.size() - 1);
ieLayer->params["out_sizes"] = format("%d", input->dims[0]);
ieLayer->params["axis"] = format("%d", (int)input->dims.size() - 1);
ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]);
#endif
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE

View File

@ -220,9 +220,14 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height;
{
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R4) ||
(preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height);
}
else
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan());

View File

@ -302,7 +302,8 @@ void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device)
{
if (device != InferenceEngine::TargetDevice::eCPU &&
device != InferenceEngine::TargetDevice::eGPU &&
device != InferenceEngine::TargetDevice::eMYRIAD)
device != InferenceEngine::TargetDevice::eMYRIAD &&
device != InferenceEngine::TargetDevice::eFPGA)
CV_Error(Error::StsNotImplemented, "");
targetDevice = device;
}
@ -314,7 +315,8 @@ InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() CV_NOEXCEPT
InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() const CV_NOEXCEPT
{
return targetDevice;
return targetDevice == InferenceEngine::TargetDevice::eFPGA ?
InferenceEngine::TargetDevice::eHETERO : targetDevice;
}
InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) CV_NOEXCEPT
@ -466,6 +468,11 @@ void InfEngineBackendNet::init(int targetId)
setPrecision(InferenceEngine::Precision::FP16);
setTargetDevice(InferenceEngine::TargetDevice::eMYRIAD); break;
}
case DNN_TARGET_FPGA:
{
setPrecision(InferenceEngine::Precision::FP16);
setTargetDevice(InferenceEngine::TargetDevice::eFPGA); break;
}
default:
CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
}
@ -489,10 +496,15 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
}
else
{
enginePtr = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice);
auto dispatcher = InferenceEngine::PluginDispatcher({""});
if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
else
enginePtr = dispatcher.getSuitablePlugin(targetDevice);
sharedPlugins[targetDevice] = enginePtr;
if (targetDevice == InferenceEngine::TargetDevice::eCPU)
if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
targetDevice == InferenceEngine::TargetDevice::eFPGA)
{
std::string suffixes[] = {"_avx2", "_sse4", ""};
bool haveFeature[] = {

View File

@ -68,6 +68,7 @@ static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
case DNN_TARGET_OPENCL_FP16: *os << "OCL_FP16"; return;
case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return;
case DNN_TARGET_VULKAN: *os << "VULKAN"; return;
case DNN_TARGET_FPGA: *os << "FPGA"; return;
} // don't use "default:" to emit compiler warnings
*os << "DNN_TARGET_UNKNOWN(" << (int)v << ")";
}
@ -190,7 +191,7 @@ static inline void normAssertDetections(cv::Mat ref, cv::Mat out, const char *co
testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff);
}
static inline bool checkMyriadTarget()
static inline bool checkIETarget(int target)
{
#ifndef HAVE_INF_ENGINE
return false;
@ -199,7 +200,7 @@ static inline bool checkMyriadTarget()
cv::dnn::LayerParams lp;
net.addLayerToPrev("testLayer", "Identity", lp);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(cv::dnn::DNN_TARGET_MYRIAD);
net.setPreferableTarget(target);
static int inpDims[] = {1, 2, 3, 4};
net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
try
@ -267,7 +268,7 @@ testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets
targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
}
#endif
if (checkMyriadTarget())
if (checkIETarget(DNN_TARGET_MYRIAD))
targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
}
#endif
@ -351,7 +352,7 @@ public:
}
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
{
if (!checkMyriadTarget())
if (!checkIETarget(DNN_TARGET_MYRIAD))
{
throw SkipTestException("Myriad is not available/disabled in OpenCV");
}

View File

@ -57,28 +57,29 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
InferencePlugin plugin;
ExecutableNetwork netExec;
InferRequest infRequest;
TargetDevice targetDevice;
try
{
auto dispatcher = InferenceEngine::PluginDispatcher({""});
switch (target)
{
case DNN_TARGET_CPU:
targetDevice = TargetDevice::eCPU;
enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
break;
case DNN_TARGET_OPENCL:
case DNN_TARGET_OPENCL_FP16:
targetDevice = TargetDevice::eGPU;
enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
break;
case DNN_TARGET_MYRIAD:
targetDevice = TargetDevice::eMYRIAD;
enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
break;
case DNN_TARGET_FPGA:
enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
break;
default:
CV_Error(Error::StsNotImplemented, "Unknown target");
};
try
{
enginePtr = PluginDispatcher({""}).getSuitablePlugin(targetDevice);
if (targetDevice == TargetDevice::eCPU)
if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
{
std::string suffixes[] = {"_avx2", "_sse4", ""};
bool haveFeature[] = {
@ -255,8 +256,10 @@ static testing::internal::ParamGenerator<Target> dnnDLIETargets()
targets.push_back(DNN_TARGET_OPENCL_FP16);
}
#endif
if (checkMyriadTarget())
if (checkIETarget(DNN_TARGET_MYRIAD))
targets.push_back(DNN_TARGET_MYRIAD);
if (checkIETarget(DNN_TARGET_FPGA))
targets.push_back(DNN_TARGET_FPGA);
return testing::ValuesIn(targets);
}

View File

@ -351,7 +351,7 @@ TEST_P(Test_Caffe_layers, Conv_Elu)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
{
if (!checkMyriadTarget())
if (!checkIETarget(DNN_TARGET_MYRIAD))
throw SkipTestException("Myriad is not available/disabled in OpenCV");
}

View File

@ -157,7 +157,7 @@ TEST_P(setInput, normalization)
const int target = get<1>(get<3>(GetParam()));
const bool kSwapRB = true;
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkMyriadTarget())
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkIETarget(DNN_TARGET_MYRIAD))
throw SkipTestException("Myriad is not available/disabled in OpenCV");
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F)
throw SkipTestException("");

View File

@ -117,7 +117,7 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
end = keypoints.end();
for( ; it != end; ++it )
{
Scalar color = isRandColor ? Scalar(rng(256), rng(256), rng(256)) : _color;
Scalar color = isRandColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : _color;
_drawKeypoint( outImage, *it, color, flags );
}
}
@ -173,7 +173,7 @@ static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1
{
RNG& rng = theRNG();
bool isRandMatchColor = matchColor == Scalar::all(-1);
Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256) ) : matchColor;
Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : matchColor;
_drawKeypoint( outImg1, kp1, color, flags );
_drawKeypoint( outImg2, kp2, color, flags );

View File

@ -82,7 +82,84 @@ public:
memset(buf.data(), 0, buf.size() * sizeof(float));
float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
for( k = 0; k < maxk; k++ )
k = 0;
for(; k <= maxk-4; k+=4)
{
const uchar* ksptr0 = sptr + space_ofs[k];
const uchar* ksptr1 = sptr + space_ofs[k+1];
const uchar* ksptr2 = sptr + space_ofs[k+2];
const uchar* ksptr3 = sptr + space_ofs[k+3];
j = 0;
#if CV_SIMD
v_float32 kweight0 = vx_setall_f32(space_weight[k]);
v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
{
v_uint32 rval = vx_load_expand_q(sptr + j);
v_uint32 val = vx_load_expand_q(ksptr0 + j);
v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
val = vx_load_expand_q(ksptr1 + j);
w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_wsum += w;
v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
val = vx_load_expand_q(ksptr2 + j);
w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_wsum += w;
v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
val = vx_load_expand_q(ksptr3 + j);
w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
v_wsum += w;
v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
v_store_aligned(wsum + j, v_wsum);
v_store_aligned(sum + j, v_sum);
}
#endif
#if CV_SIMD128
v_float32x4 kweight4 = v_load(space_weight + k);
#endif
for (; j < size.width; j++)
{
#if CV_SIMD128
v_uint32x4 rval = v_setall_u32(sptr[j]);
v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
#else
int rval = sptr[j];
int val = ksptr0[j];
float w = space_weight[k] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr1[j];
w = space_weight[k+1] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr2[j];
w = space_weight[k+2] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
val = ksptr3[j];
w = space_weight[k+3] * color_weight[std::abs(val - rval)];
wsum[j] += w;
sum[j] += val * w;
#endif
}
}
for(; k < maxk; k++)
{
const uchar* ksptr = sptr + space_ofs[k];
j = 0;
@ -126,7 +203,232 @@ public:
float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
for(k = 0; k < maxk; k++ )
k = 0;
for(; k <= maxk-4; k+=4)
{
const uchar* ksptr0 = sptr + space_ofs[k];
const uchar* ksptr1 = sptr + space_ofs[k+1];
const uchar* ksptr2 = sptr + space_ofs[k+2];
const uchar* ksptr3 = sptr + space_ofs[k+3];
const uchar* rsptr = sptr;
j = 0;
#if CV_SIMD
v_float32 kweight0 = vx_setall_f32(space_weight[k]);
v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
{
v_uint8 kb, kg, kr, rb, rg, rr;
v_load_deinterleave(rsptr, rb, rg, rr);
v_load_deinterleave(ksptr0, kb, kg, kr);
v_uint16 val0, val1, val2, val3, val4;
v_expand(v_absdiff(kb, rb), val0, val1);
v_expand(v_absdiff(kg, rg), val2, val3);
val0 += val2; val1 += val3;
v_expand(v_absdiff(kr, rr), val2, val3);
val0 += val2; val1 += val3;
v_uint32 vall, valh;
v_expand(val0, vall, valh);
v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
v_expand(kb, val0, val2);
v_expand(val0, vall, valh);
v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
v_expand(kg, val0, val3);
v_expand(val0, vall, valh);
v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
v_expand(kr, val0, val4);
v_expand(val0, vall, valh);
v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
v_expand(val1, vall, valh);
w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
v_expand(val2, vall, valh);
v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
v_expand(val3, vall, valh);
v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
v_expand(val4, vall, valh);
v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
v_load_deinterleave(ksptr1, kb, kg, kr);
v_expand(v_absdiff(kb, rb), val0, val1);
v_expand(v_absdiff(kg, rg), val2, val3);
val0 += val2; val1 += val3;
v_expand(v_absdiff(kr, rr), val2, val3);
val0 += val2; val1 += val3;
v_expand(val0, vall, valh);
w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
v_expand(kb, val0, val2);
v_expand(val0, vall, valh);
v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
v_expand(kg, val0, val3);
v_expand(val0, vall, valh);
v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
v_expand(kr, val0, val4);
v_expand(val0, vall, valh);
v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
v_expand(val1, vall, valh);
w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
v_expand(val2, vall, valh);
v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
v_expand(val3, vall, valh);
v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
v_expand(val4, vall, valh);
v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
v_load_deinterleave(ksptr2, kb, kg, kr);
v_expand(v_absdiff(kb, rb), val0, val1);
v_expand(v_absdiff(kg, rg), val2, val3);
val0 += val2; val1 += val3;
v_expand(v_absdiff(kr, rr), val2, val3);
val0 += val2; val1 += val3;
v_expand(val0, vall, valh);
w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
v_expand(kb, val0, val2);
v_expand(val0, vall, valh);
v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
v_expand(kg, val0, val3);
v_expand(val0, vall, valh);
v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
v_expand(kr, val0, val4);
v_expand(val0, vall, valh);
v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
v_expand(val1, vall, valh);
w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
v_expand(val2, vall, valh);
v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
v_expand(val3, vall, valh);
v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
v_expand(val4, vall, valh);
v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
v_load_deinterleave(ksptr3, kb, kg, kr);
v_expand(v_absdiff(kb, rb), val0, val1);
v_expand(v_absdiff(kg, rg), val2, val3);
val0 += val2; val1 += val3;
v_expand(v_absdiff(kr, rr), val2, val3);
val0 += val2; val1 += val3;
v_expand(val0, vall, valh);
w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
v_expand(kb, val0, val2);
v_expand(val0, vall, valh);
v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
v_expand(kg, val0, val3);
v_expand(val0, vall, valh);
v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
v_expand(kr, val0, val4);
v_expand(val0, vall, valh);
v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
v_expand(val1, vall, valh);
w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
v_expand(val2, vall, valh);
v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
v_expand(val3, vall, valh);
v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
v_expand(val4, vall, valh);
v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
}
#endif
#if CV_SIMD128
v_float32x4 kweight4 = v_load(space_weight + k);
#endif
for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
{
#if CV_SIMD128
v_uint32x4 rb = v_setall_u32(rsptr[0]);
v_uint32x4 rg = v_setall_u32(rsptr[1]);
v_uint32x4 rr = v_setall_u32(rsptr[2]);
v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
#else
int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
wsum[j] += w;
sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
#endif
}
}
for(; k < maxk; k++)
{
const uchar* ksptr = sptr + space_ofs[k];
const uchar* rsptr = sptr;
@ -421,7 +723,130 @@ public:
v_float32 v_one = vx_setall_f32(1.f);
v_float32 sindex = vx_setall_f32(scale_index);
#endif
for( k = 0; k < maxk; k++ )
k = 0;
for(; k <= maxk - 4; k+=4)
{
const float* ksptr0 = sptr + space_ofs[k];
const float* ksptr1 = sptr + space_ofs[k + 1];
const float* ksptr2 = sptr + space_ofs[k + 2];
const float* ksptr3 = sptr + space_ofs[k + 3];
j = 0;
#if CV_SIMD
v_float32 kweight0 = vx_setall_f32(space_weight[k]);
v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
{
v_float32 rval = vx_load(sptr + j);
v_float32 val = vx_load(ksptr0 + j);
v_float32 knan = v_not_nan(val);
v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
v_int32 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
val = vx_load(ksptr1 + j);
knan = v_not_nan(val);
alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum = v_muladd(val & knan, w, v_sum);
val = vx_load(ksptr2 + j);
knan = v_not_nan(val);
alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum = v_muladd(val & knan, w, v_sum);
val = vx_load(ksptr3 + j);
knan = v_not_nan(val);
alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum = v_muladd(val & knan, w, v_sum);
v_store_aligned(wsum + j, v_wsum);
v_store_aligned(sum + j, v_sum);
}
#endif
#if CV_SIMD128
v_float32x4 v_one4 = v_setall_f32(1.f);
v_float32x4 sindex4 = v_setall_f32(scale_index);
v_float32x4 kweight4 = v_load(space_weight + k);
#endif
for (; j < size.width; j++)
{
#if CV_SIMD128
v_float32x4 rval = v_setall_f32(sptr[j]);
v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
v_float32x4 knan = v_not_nan(val);
v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
wsum[j] += v_reduce_sum(w);
sum[j] += v_reduce_sum((val & knan) * w);
#else
float rval = sptr[j];
float val = ksptr0[j];
float alpha = std::abs(val - rval) * scale_index;
int idx = cvFloor(alpha);
alpha -= idx;
if (!cvIsNaN(val))
{
float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum[j] += val * w;
}
val = ksptr1[j];
alpha = std::abs(val - rval) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!cvIsNaN(val))
{
float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum[j] += val * w;
}
val = ksptr2[j];
alpha = std::abs(val - rval) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!cvIsNaN(val))
{
float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum[j] += val * w;
}
val = ksptr3[j];
alpha = std::abs(val - rval) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!cvIsNaN(val))
{
float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum[j] += val * w;
}
#endif
}
}
for(; k < maxk; k++)
{
const float* ksptr = sptr + space_ofs[k];
j = 0;
@ -430,36 +855,44 @@ public:
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
{
v_float32 val = vx_load(ksptr + j);
v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
v_float32 rval = vx_load(sptr + j);
v_float32 knan = v_not_nan(val);
v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
v_int32 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
}
#endif
for (; j < size.width; j++)
{
float val = ksptr[j];
float alpha = std::abs(val - sptr[j]) * scale_index;
float rval = sptr[j];
float alpha = std::abs(val - rval) * scale_index;
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
if (!cvIsNaN(val))
{
float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum[j] += val * w;
}
}
}
j = 0;
#if CV_SIMD
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
{
v_float32 v_val = vx_load(sptr + j);
v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
}
#endif
for (; j < size.width; j++)
{
CV_DbgAssert(fabs(wsum[j]) > 0);
dptr[j] = sum[j] / wsum[j];
CV_DbgAssert(fabs(wsum[j]) >= 0);
dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f);
}
}
else
@ -475,7 +908,162 @@ public:
v_float32 v_one = vx_setall_f32(1.f);
v_float32 sindex = vx_setall_f32(scale_index);
#endif
for (k = 0; k < maxk; k++)
k = 0;
for (; k <= maxk-4; k+=4)
{
const float* ksptr0 = sptr + space_ofs[k];
const float* ksptr1 = sptr + space_ofs[k+1];
const float* ksptr2 = sptr + space_ofs[k+2];
const float* ksptr3 = sptr + space_ofs[k+3];
const float* rsptr = sptr;
j = 0;
#if CV_SIMD
v_float32 kweight0 = vx_setall_f32(space_weight[k]);
v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
{
v_float32 kb, kg, kr, rb, rg, rr;
v_load_deinterleave(rsptr, rb, rg, rr);
v_load_deinterleave(ksptr0, kb, kg, kr);
v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
v_int32 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
v_load_deinterleave(ksptr1, kb, kg, kr);
knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum_b = v_muladd(kb & knan, w, v_sum_b);
v_sum_g = v_muladd(kg & knan, w, v_sum_g);
v_sum_r = v_muladd(kr & knan, w, v_sum_r);
v_load_deinterleave(ksptr2, kb, kg, kr);
knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum_b = v_muladd(kb & knan, w, v_sum_b);
v_sum_g = v_muladd(kg & knan, w, v_sum_g);
v_sum_r = v_muladd(kr & knan, w, v_sum_r);
v_load_deinterleave(ksptr3, kb, kg, kr);
knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_wsum += w;
v_sum_b = v_muladd(kb & knan, w, v_sum_b);
v_sum_g = v_muladd(kg & knan, w, v_sum_g);
v_sum_r = v_muladd(kr & knan, w, v_sum_r);
v_store_aligned(wsum + j, v_wsum);
v_store_aligned(sum_b + j, v_sum_b);
v_store_aligned(sum_g + j, v_sum_g);
v_store_aligned(sum_r + j, v_sum_r);
}
#endif
#if CV_SIMD128
v_float32x4 v_one4 = v_setall_f32(1.f);
v_float32x4 sindex4 = v_setall_f32(scale_index);
v_float32x4 kweight4 = v_load(space_weight + k);
#endif
for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
{
#if CV_SIMD128
v_float32x4 rb = v_setall_f32(rsptr[0]);
v_float32x4 rg = v_setall_f32(rsptr[1]);
v_float32x4 rr = v_setall_f32(rsptr[2]);
v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
v_int32x4 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
wsum[j] += v_reduce_sum(w);
sum_b[j] += v_reduce_sum((kb & knan) * w);
sum_g[j] += v_reduce_sum((kg & knan) * w);
sum_r[j] += v_reduce_sum((kr & knan) * w);
#else
float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
int idx = cvFloor(alpha);
alpha -= idx;
if (!v_NAN)
{
float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum_b[j] += b*w;
sum_g[j] += g*w;
sum_r[j] += r*w;
}
b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!v_NAN)
{
float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum_b[j] += b*w;
sum_g[j] += g*w;
sum_r[j] += r*w;
}
b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!v_NAN)
{
float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum_b[j] += b*w;
sum_g[j] += g*w;
sum_r[j] += r*w;
}
b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
idx = cvFloor(alpha);
alpha -= idx;
if (!v_NAN)
{
float w = space_weight[k+3] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum_b[j] += b*w;
sum_g[j] += g*w;
sum_r[j] += r*w;
}
#endif
}
}
for (; k < maxk; k++)
{
const float* ksptr = sptr + space_ofs[k];
const float* rsptr = sptr;
@ -488,46 +1076,69 @@ public:
v_load_deinterleave(ksptr, kb, kg, kr);
v_load_deinterleave(rsptr, rb, rg, rr);
v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
v_int32 idx = v_trunc(alpha);
alpha -= v_cvt_f32(idx);
v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
}
#endif
for (; j < size.width; j++, ksptr += 3, rsptr += 3)
{
float b = ksptr[0], g = ksptr[1], r = ksptr[2];
float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
if (!v_NAN)
{
float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
wsum[j] += w;
sum_b[j] += b*w;
sum_g[j] += g*w;
sum_r[j] += r*w;
}
}
}
j = 0;
#if CV_SIMD
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
{
v_float32 w = v_one / vx_load_aligned(wsum + j);
v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
v_float32 b, g, r;
v_load_deinterleave(sptr, b, g, r);
v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
}
#endif
for (; j < size.width; j++)
{
CV_DbgAssert(fabs(wsum[j]) > 0);
CV_DbgAssert(fabs(wsum[j]) >= 0);
float b = *(sptr++);
float g = *(sptr++);
float r = *(sptr++);
if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r))
{
wsum[j] = 1.f / wsum[j];
*(dptr++) = sum_b[j] * wsum[j];
*(dptr++) = sum_g[j] * wsum[j];
*(dptr++) = sum_r[j] * wsum[j];
}
else
{
wsum[j] = 1.f / (wsum[j] + 1.f);
*(dptr++) = (sum_b[j] + b) * wsum[j];
*(dptr++) = (sum_g[j] + g) * wsum[j];
*(dptr++) = (sum_r[j] + r) * wsum[j];
}
}
}
}
#if CV_SIMD
@ -585,9 +1196,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
// temporary copy of the image with borders for easy processing
Mat temp;
copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
minValSrc -= 5. * sigma_color;
patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative
// TODO: make replacement parameter avalible in the outside function interface
// allocate lookup tables
std::vector<float> _space_weight(d*d);
std::vector<int> _space_ofs(d*d);
@ -620,7 +1229,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
for( j = -radius; j <= radius; j++ )
{
double r = std::sqrt((double)i*i + (double)j*j);
if( r > radius )
if( r > radius || ( i == 0 && j == 0 ) )
continue;
space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);

View File

@ -38,6 +38,8 @@ aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
width_stride = float(grid_anchor_generator['width_stride'][0])
height_stride = float(grid_anchor_generator['height_stride'][0])
features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
print('Number of classes: %d' % num_classes)
print('Scales: %s' % str(scales))
@ -53,7 +55,8 @@ graph_def = parseTextGraph(args.output)
removeIdentity(graph_def)
def to_remove(name, op):
return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep)
return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
(name.startswith('CropAndResize') and op != 'CropAndResize')
removeUnusedNodesAndAttrs(to_remove, graph_def)
@ -123,20 +126,22 @@ detectionOut.input.append('proposals')
detectionOut.addAttr('num_classes', 2)
detectionOut.addAttr('share_location', True)
detectionOut.addAttr('background_label_id', 0)
detectionOut.addAttr('nms_threshold', 0.7)
detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
detectionOut.addAttr('top_k', 6000)
detectionOut.addAttr('code_type', "CENTER_SIZE")
detectionOut.addAttr('keep_top_k', 100)
detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
detectionOut.addAttr('clip', True)
graph_def.node.extend([detectionOut])
# Save as text.
cropAndResizeNodesNames = []
for node in reversed(topNodes):
if node.op != 'CropAndResize':
graph_def.node.extend([node])
topNodes.pop()
else:
cropAndResizeNodesNames.append(node.name)
if numCropAndResize == 1:
break
else:
@ -166,11 +171,15 @@ for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape']:
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
del graph_def.node[i]
for node in graph_def.node:
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
node.op = 'Flatten'
node.input.pop()
@ -178,6 +187,12 @@ for node in graph_def.node:
'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
node.addAttr('loc_pred_transposed', True)
if node.name.startswith('MaxPool2D'):
assert(node.op == 'MaxPool')
assert(len(cropAndResizeNodesNames) == 2)
node.input = [cropAndResizeNodesNames[0]]
del cropAndResizeNodesNames[0]
################################################################################
### Postprocessing
################################################################################
@ -223,6 +238,11 @@ graph_def.node.extend([detectionOut])
for node in reversed(topNodes):
graph_def.node.extend([node])
if node.name.startswith('MaxPool2D'):
assert(node.op == 'MaxPool')
assert(len(cropAndResizeNodesNames) == 1)
node.input = [cropAndResizeNodesNames[0]]
for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].op == 'CropAndResize':
graph_def.node[i].input.insert(1, 'detection_out_final')