Mirror of https://github.com/opencv/opencv.git

commit 2ad0487cec
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
@@ -123,6 +123,9 @@ if(CV_GCC OR CV_CLANG)
 add_extra_compiler_option(-Wsign-promo)
 add_extra_compiler_option(-Wuninitialized)
 add_extra_compiler_option(-Winit-self)
+if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+add_extra_compiler_option(-Wno-psabi)
+endif()
 if(HAVE_CXX11)
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT ENABLE_PRECOMPILED_HEADERS)
 add_extra_compiler_option(-Wsuggest-override)
@@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
 {
-vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
-static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-sv = vec_sl(sv, slm);
-vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
-static const vec_uint4 slm4 = {0, 0, 8, 8};
-sv4 = vec_sl(sv4, slm4);
-return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
+static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
+return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }

 inline int v_signmask(const v_int16x8& a)
 {
-static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
-vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
-sv = vec_sl(sv, slm);
-vec_int4 svi = vec_int4_z;
-svi = vec_sums(vec_sum4s(sv, svi), svi);
-return vec_extract(svi, 3);
+static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
+return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint16x8& a)
 { return v_signmask(v_reinterpret_as_s16(a)); }

 inline int v_signmask(const v_int32x4& a)
 {
-static const vec_uint4 slm = {0, 1, 2, 3};
-vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
-sv = vec_sl(sv, slm);
-sv = vec_sums(sv, vec_int4_z);
-return vec_extract(sv, 3);
+static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
+return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint32x4& a)
 { return v_signmask(v_reinterpret_as_s32(a)); }
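Note: the rewritten v_signmask variants above rely on the POWER8 vec_vbpermq bit-permute instruction, so they only run on VSX hardware. As a minimal, portable sketch of the contract they implement (assuming nothing beyond standard C++): gather the most significant bit of each lane into one integer, lane 0 in bit 0.

    #include <cstdint>
    #include <cstdio>

    // Scalar reference for v_signmask on 16 x u8: MSB of lane i -> bit i.
    static int signmask_u8_scalar(const uint8_t v[16])
    {
        int mask = 0;
        for (int i = 0; i < 16; ++i)
            mask |= ((v[i] >> 7) & 1) << i;
        return mask;
    }

    int main()
    {
        const uint8_t v[16] = {0x80, 0, 0x90, 0, 0, 0, 0, 0xFF,
                               0, 0, 0, 0, 0, 0, 0, 0x81};
        printf("0x%04x\n", signmask_u8_scalar(v)); // lanes 0, 2, 7, 15 -> 0x8085
        return 0;
    }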
@@ -554,7 +554,9 @@ struct HWFeatures
 have[CV_CPU_FP16] = true;
 #endif
 #endif

+#if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
+have[CV_CPU_NEON] = true;
+#endif
 // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
 have[CV_CPU_VSX] = (CV_VSX);
 // TODO: Check VSX3 availability in runtime for other platforms
@@ -160,14 +160,7 @@ TEST(Core_Ptr, assignment)

 {
 Ptr<Reporter> p1(new Reporter(&deleted1));
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic push)
-CV_DO_PRAGMA(GCC diagnostic ignored "-Wself-assign-overloaded")
-#endif
-p1 = p1;
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic pop)
-#endif
+p1 = *&p1;
 EXPECT_FALSE(deleted1);
 }
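Note: a small self-contained illustration of the trick used above, under the assumption (as the deleted pragmas suggest) that clang's -Wself-assign-overloaded only matches the literal p1 = p1 form; routing the assignment through *& exercises the same operator= path without the warning. The Handle type here is hypothetical, not OpenCV's Ptr.

    #include <cassert>

    struct Handle {                      // hypothetical stand-in for Ptr<T>
        int* ref = nullptr;
        Handle& operator=(const Handle& other) {
            if (this != &other)          // must tolerate self-assignment
                ref = other.ref;
            return *this;
        }
    };

    int main() {
        int x = 42;
        Handle h;
        h.ref = &x;
        h = *&h;                         // self-assignment, no clang warning
        assert(h.ref == &x);
        return 0;
    }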
@@ -37,7 +37,9 @@ else()
 -Wunused-parameter -Wsign-compare
 )
 endif()

 if(HAVE_CUDA)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 endif()
 if(NOT HAVE_CXX11)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef) # LANG_CXX11 from protobuf files
 endif()
@@ -123,9 +123,12 @@ PERF_TEST_P_(DNNTestNetwork, SSD)

 PERF_TEST_P_(DNNTestNetwork, OpenFace)
 {
-if (backend == DNN_BACKEND_HALIDE ||
-(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+if (backend == DNN_BACKEND_HALIDE)
 throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+throw SkipTestException("");
+#endif
 processNet("dnn/openface_nn4.small2.v1.t7", "", "",
 Mat(cv::Size(96, 96), CV_32FC3));
 }
@@ -185,16 +188,6 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
 if (backend == DNN_BACKEND_HALIDE)
 throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE)
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-throw SkipTestException("Test is disabled for MyriadX");
-#endif
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-throw SkipTestException("Test is disabled for Myriad in OpenVINO 2019R2");
-#endif
-
 processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
 Mat(cv::Size(300, 300), CV_32FC3));
 }
@@ -719,21 +719,23 @@ struct DataLayer : public Layer
 CV_Assert(numChannels <= 4);

 // Scale
-auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-{numChannels});
+InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
+InferenceEngine::Layout::C);
+auto weights = InferenceEngine::make_shared_blob<float>(td);
 weights->allocate();
-weights->set(std::vector<float>(numChannels, scaleFactors[0]));
+
+float* weight_buf = weights->buffer().as<float*>();
+std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);

 // Mean subtraction
-auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-{numChannels});
+auto biases = InferenceEngine::make_shared_blob<float>(td);
 biases->allocate();
-std::vector<float> biasesVec(numChannels);
+float* bias_buf = biases->buffer().as<float*>();

 for (int i = 0; i < numChannels; ++i)
 {
-biasesVec[i] = -means[0][i] * scaleFactors[0];
+bias_buf[i] = -means[0][i] * scaleFactors[0];
 }
-biases->set(biasesVec);

 InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
 addConstantData("weights", weights, ieLayer);
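Note: a minimal sketch of the blob-construction pattern this hunk migrates to, assuming the 2019-era InferenceEngine API: precision, dims and layout travel together in a TensorDesc, and data is written through buffer() because Blob::set() was removed.

    #include <algorithm>
    #include <inference_engine.hpp>

    // Create a 1-D FP32 blob of numChannels elements and fill it with value.
    InferenceEngine::Blob::Ptr makeFilledBlob(size_t numChannels, float value)
    {
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32,
                                       {numChannels}, InferenceEngine::Layout::C);
        auto blob = InferenceEngine::make_shared_blob<float>(td);
        blob->allocate();                          // reserve storage first
        float* buf = blob->buffer().as<float*>();  // raw pointer to the data
        std::fill(buf, buf + numChannels, value);
        return blob;
    }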
@@ -1536,7 +1538,11 @@ struct Net::Impl
 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
 {
 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
 dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+#else
+dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+#endif
 }
 }
 else
@@ -1544,7 +1550,11 @@ struct Net::Impl
 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
 {
 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
 dataPtr->name = ld.name;
+#else
+dataPtr->setName(ld.name);
+#endif
 }
 }
 }
@@ -1565,7 +1575,11 @@ struct Net::Impl
 for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
 {
 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
 dataPtr->name = netInputLayer->outNames[i];
+#else
+dataPtr->setName(netInputLayer->outNames[i]);
+#endif
 }
 }
 else
@@ -1573,7 +1587,11 @@ struct Net::Impl
 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
 {
 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
 dataPtr->name = ld.name;
+#else
+dataPtr->setName(ld.name);
+#endif
 }
 }
 ieNode->net->addBlobs(ld.inputBlobsWrappers);
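Note: the four hunks above repeat one pattern; a hypothetical helper (not part of the patch) makes it explicit — before OpenVINO 2019R2 the data node name is a public field, afterwards it sits behind an accessor.

    #include <string>
    #include <inference_engine.hpp>

    // setDataName is illustrative only; the macro is OpenCV's version guard.
    static void setDataName(const InferenceEngine::DataPtr& data, const std::string& name)
    {
    #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
        data->name = name;       // old API: plain public member
    #else
        data->setName(name);     // new API: setter
    #endif
    }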
@@ -111,7 +111,8 @@ public:
 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
 {
 InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-CV_Assert(!input->dims.empty());
+std::vector<size_t> dims = input->getDims();
+CV_Assert(!dims.empty());

 InferenceEngine::Builder::Layer ieLayer(name);
 ieLayer.setName(name);
@@ -122,12 +123,10 @@ public:
 else
 {
 ieLayer.setType("Split");
-ieLayer.getParameters()["axis"] = input->dims.size() - 1;
-ieLayer.getParameters()["out_sizes"] = input->dims[0];
+ieLayer.getParameters()["axis"] = dims.size() - 1;
+ieLayer.getParameters()["out_sizes"] = dims[0];
 }
-std::vector<size_t> shape(input->dims);
-std::reverse(shape.begin(), shape.end());
-ieLayer.setInputPorts({InferenceEngine::Port(shape)});
+ieLayer.setInputPorts({InferenceEngine::Port(dims)});
 ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 }
@@ -316,7 +316,7 @@ public:
 InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

 InferenceEngine::Builder::ConcatLayer ieLayer(name);
-ieLayer.setAxis(clamp(axis, input->dims.size()));
+ieLayer.setAxis(clamp(axis, input->getDims().size()));
 ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 }
@@ -541,15 +541,14 @@ public:
 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 {
 InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-CV_Assert(input->dims.size() == 4 || input->dims.size() == 5);
-
-const int inpCn = input->dims[input->dims.size() - 2]; // NOTE: input->dims are reversed (WHIO or WHDIO)
+std::vector<size_t> dims = input->getDims();
+CV_Assert(dims.size() == 4 || dims.size() == 5);
+const int inpCn = dims[1];
 const int outCn = blobs[0].size[0];
 const int inpGroupCn = blobs[0].size[1];
 const int group = inpCn / inpGroupCn;

-InferenceEngine::Layout layout = (input->dims.size() == 4) ? InferenceEngine::Layout::OIHW :
-InferenceEngine::Layout::NCDHW;
+InferenceEngine::Layout layout = (dims.size() == 4) ? InferenceEngine::Layout::OIHW :
+InferenceEngine::Layout::NCDHW;

 auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
 if (fusedWeights)
@@ -561,9 +560,10 @@ public:
 }
 else
 {
-ieWeights = InferenceEngine::make_shared_blob<float>(
-InferenceEngine::Precision::FP32, layout,
-ieWeights->dims());
+ieWeights = InferenceEngine::make_shared_blob<float>({
+InferenceEngine::Precision::FP32,
+ieWeights->getTensorDesc().getDims(), layout
+});
 ieWeights->allocate();

 Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn);
@@ -1953,9 +1953,10 @@ public:
 auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
 if (fusedWeights)
 {
-ieWeights = InferenceEngine::make_shared_blob<float>(
-InferenceEngine::Precision::FP32, layout,
-ieWeights->dims());
+ieWeights = InferenceEngine::make_shared_blob<float>({
+InferenceEngine::Precision::FP32,
+ieWeights->getTensorDesc().getDims(), layout
+});
 ieWeights->allocate();

 int inpCn = blobs[0].size[0];
@@ -261,7 +261,8 @@ public:
 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
 {
 InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-if (input->dims.size() == 4)
+std::vector<size_t> dims = input->getDims();
+if (dims.size() == 4)
 {
 InferenceEngine::Builder::NormalizeLayer ieLayer(name);

@@ -270,13 +271,14 @@ public:
 ieLayer.setEpsilon(epsilon);

 InferenceEngine::Builder::Layer l = ieLayer;
-const int numChannels = input->dims[2]; // NOTE: input->dims are reversed (whcn)
+const int numChannels = dims[1];
 InferenceEngine::Blob::Ptr weights;
 if (blobs.empty())
 {
-weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-InferenceEngine::Layout::C,
-{(size_t)numChannels});
+weights = InferenceEngine::make_shared_blob<float>({
+InferenceEngine::Precision::FP32,
+{(size_t)numChannels}, InferenceEngine::Layout::C
+});
 weights->allocate();

 Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);
@@ -167,9 +167,11 @@ public:
 if (kernel_size.size() == 3)
 return preferableTarget == DNN_TARGET_CPU;
 if (preferableTarget == DNN_TARGET_MYRIAD) {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2) ) {
 return !isMyriadX();
 }
+#endif
 return type == MAX || type == AVE;
 }
 else
@@ -207,12 +207,13 @@ public:
 }
 else
 {
-auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-{numChannels});
+auto weights = InferenceEngine::make_shared_blob<float>({
+InferenceEngine::Precision::FP32, {(size_t)numChannels},
+InferenceEngine::Layout::C
+});
 weights->allocate();

-std::vector<float> ones(numChannels, 1);
-weights->set(ones);
+float* buf = weights->buffer().as<float*>();
+std::fill(buf, buf + numChannels, 1);
 addConstantData("weights", weights, l);
 }
 if (hasBias)
@@ -301,14 +301,14 @@ public:
 {
 std::vector<size_t> outShape(numDims);
 for (int i = 0; i < numDims; ++i)
-outShape[numDims - 1 - i] = sliceRanges[0][i].size();
+outShape[i] = sliceRanges[0][i].size();

 ieLayer.getInputPorts()[1].setParameter("type", "weights");

 // Fake blob which will be moved to inputs (as weights).
-auto shapeSource = InferenceEngine::make_shared_blob<float>(
-InferenceEngine::Precision::FP32,
-InferenceEngine::Layout::ANY, outShape);
+auto shapeSource = InferenceEngine::make_shared_blob<float>({
+InferenceEngine::Precision::FP32, outShape,
+InferenceEngine::Layout::ANY
+});
 shapeSource->allocate();
 addConstantData("weights", shapeSource, ieLayer);
 }
@@ -329,7 +329,8 @@ public:
 InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

 InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
-ieLayer.setAxis(clamp(axisRaw, input->dims.size()));
+ieLayer.setAxis(clamp(axisRaw, input->getDims().size()));

 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 }
 #endif // HAVE_INF_ENGINE
@@ -45,13 +45,13 @@ infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
 InfEngineBackendNet::InfEngineBackendNet() : netBuilder("")
 {
 hasNetOwner = false;
-targetDevice = InferenceEngine::TargetDevice::eCPU;
+device_name = "CPU";
 }

 InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net)
 {
 hasNetOwner = true;
-targetDevice = InferenceEngine::TargetDevice::eCPU;
+device_name = "CPU";
 }

 void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
@@ -66,16 +66,13 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
 for (size_t i = 0; i < inpWrappers.size(); ++i)
 {
 const auto& inp = inpWrappers[i];
-const std::string& inpName = inp->dataPtr->name;
+const std::string& inpName = inp->dataPtr->getName();
 int inpId;
 it = layers.find(inpName);
 if (it == layers.end())
 {
 InferenceEngine::Builder::InputLayer inpLayer(!inpName.empty() ? inpName : kDefaultInpLayerName);

-std::vector<size_t> shape(inp->blob->dims());
-std::reverse(shape.begin(), shape.end());
-
+std::vector<size_t> shape(inp->blob->getTensorDesc().getDims());
 inpLayer.setPort(InferenceEngine::Port(shape));
 inpId = netBuilder.addLayer(inpLayer);

@@ -89,7 +86,11 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
 }
 CV_Assert(!outputs.empty());
 InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 dataPtr->name = layerName;
+#else
+dataPtr->setName(layerName);
+#endif
 }

 void InfEngineBackendNet::init(int targetId)
@@ -115,21 +116,22 @@ void InfEngineBackendNet::init(int targetId)

 switch (targetId)
 {
-case DNN_TARGET_CPU:
-targetDevice = InferenceEngine::TargetDevice::eCPU;
-break;
-case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16:
-targetDevice = InferenceEngine::TargetDevice::eGPU;
-break;
-case DNN_TARGET_MYRIAD:
-targetDevice = InferenceEngine::TargetDevice::eMYRIAD;
-break;
-case DNN_TARGET_FPGA:
-targetDevice = InferenceEngine::TargetDevice::eFPGA;
-break;
-default:
-CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
-}
+case DNN_TARGET_CPU:
+device_name = "CPU";
+break;
+case DNN_TARGET_OPENCL:
+case DNN_TARGET_OPENCL_FP16:
+device_name = "GPU";
+break;
+case DNN_TARGET_MYRIAD:
+device_name = "MYRIAD";
+break;
+case DNN_TARGET_FPGA:
+device_name = "FPGA";
+break;
+default:
+CV_Error(Error::StsNotImplemented, "Unknown target");
+};

 for (const auto& name : requestedOutputs)
 {
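Note: a standalone sketch of the mapping the rewritten switch encodes — DNN target enums become plain device-name strings, which is what the Core-based API selects plugins by. Enum names here are simplified stand-ins for OpenCV's constants.

    #include <stdexcept>
    #include <string>

    enum Target { TARGET_CPU, TARGET_OPENCL, TARGET_OPENCL_FP16, TARGET_MYRIAD, TARGET_FPGA };

    static std::string deviceNameForTarget(Target t)
    {
        switch (t)
        {
        case TARGET_CPU:             return "CPU";
        case TARGET_OPENCL:
        case TARGET_OPENCL_FP16:     return "GPU";   // FP16 is a config detail, same device
        case TARGET_MYRIAD:          return "MYRIAD";
        case TARGET_FPGA:            return "FPGA";
        }
        throw std::runtime_error("Unknown target");
    }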
@@ -141,14 +143,14 @@ void InfEngineBackendNet::init(int targetId)
 const std::string& name = it.first;
 auto blobIt = allBlobs.find(name);
 CV_Assert(blobIt != allBlobs.end());
-it.second->setPrecision(blobIt->second->precision());
+it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());
 }
 for (const auto& it : cnn.getOutputsInfo())
 {
 const std::string& name = it.first;
 auto blobIt = allBlobs.find(name);
 CV_Assert(blobIt != allBlobs.end());
-it.second->setPrecision(blobIt->second->precision()); // Should be always FP32
+it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision()); // Should be always FP32
 }

 initPlugin(cnn);
@@ -223,16 +225,13 @@ static InferenceEngine::Layout estimateLayout(const Mat& m)

 static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std::string& name = "")
 {
-std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-std::reverse(reversedShape.begin(), reversedShape.end());
+std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
 if (m.type() == CV_32F)
-return InferenceEngine::DataPtr(
-new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
-);
+return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+{InferenceEngine::Precision::FP32, shape, estimateLayout(m)}));
 else if (m.type() == CV_8U)
-return InferenceEngine::DataPtr(
-new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m))
-);
+return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+{InferenceEngine::Precision::U8, shape, estimateLayout(m)}));
 else
 CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }
@@ -241,33 +240,33 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<s
 InferenceEngine::Layout layout)
 {
 if (m.type() == CV_32F)
-return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-layout, shape, (float*)m.data);
+return InferenceEngine::make_shared_blob<float>(
+{InferenceEngine::Precision::FP32, shape, layout}, (float*)m.data);
 else if (m.type() == CV_8U)
-return InferenceEngine::make_shared_blob<uint8_t>(InferenceEngine::Precision::U8,
-layout, shape, (uint8_t*)m.data);
+return InferenceEngine::make_shared_blob<uint8_t>(
+{InferenceEngine::Precision::U8, shape, layout}, (uint8_t*)m.data);
 else
 CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }

 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
 {
-std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-std::reverse(reversedShape.begin(), reversedShape.end());
-return wrapToInfEngineBlob(m, reversedShape, layout);
+std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
+return wrapToInfEngineBlob(m, shape, layout);
 }

 InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)
 {
-InferenceEngine::Precision precision = blob->precision();
 InferenceEngine::Blob::Ptr copy;
+auto description = blob->getTensorDesc();
+InferenceEngine::Precision precision = description.getPrecision();
 if (precision == InferenceEngine::Precision::FP32)
 {
-copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+copy = InferenceEngine::make_shared_blob<float>(description);
 }
 else if (precision == InferenceEngine::Precision::U8)
 {
-copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+copy = InferenceEngine::make_shared_blob<uint8_t>(description);
 }
 else
 CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
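Note: a hedged sketch of the zero-copy wrap used in wrapToInfEngineBlob above, assuming the TensorDesc-based make_shared_blob overload that accepts an external pointer; the Mat keeps ownership and dims are passed in natural, unreversed order.

    #include <vector>
    #include <inference_engine.hpp>
    #include <opencv2/core.hpp>

    InferenceEngine::Blob::Ptr wrapMat(const cv::Mat& m)
    {
        CV_Assert(m.type() == CV_32F && m.isContinuous());
        std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);  // no reversal
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32,
                                       shape, InferenceEngine::Layout::ANY);
        return InferenceEngine::make_shared_blob<float>(td, (float*)m.data);
    }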
@@ -296,10 +295,8 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper)
 Ptr<InfEngineBackendWrapper> ieWrapper = wrapper.dynamicCast<InfEngineBackendWrapper>();
 CV_Assert(!ieWrapper.empty());
 InferenceEngine::DataPtr srcData = ieWrapper->dataPtr;
-dataPtr = InferenceEngine::DataPtr(
-new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision,
-srcData->layout)
-);
-
+dataPtr = InferenceEngine::DataPtr(new InferenceEngine::Data(srcData->getName(), srcData->getTensorDesc()));
 blob = ieWrapper->blob;
 }
@@ -324,12 +321,19 @@ void InfEngineBackendWrapper::setHostDirty()
 }


-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
 {
-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
 return sharedPlugins;
 }

+#else
+static InferenceEngine::Core& getCore()
+{
+static InferenceEngine::Core core;
+return core;
+}
+#endif

 #if !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)
 static bool detectMyriadX_()
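Note: the #else branch above replaces the per-device plugin cache with one process-wide InferenceEngine::Core. The function-local static is a Meyers singleton — lazily constructed, thread-safe under C++11 — sketched here in isolation.

    #include <inference_engine.hpp>

    static InferenceEngine::Core& core()
    {
        static InferenceEngine::Core instance;  // constructed once, on first use
        return instance;
    }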
@@ -362,24 +366,29 @@ static bool detectMyriadX_()
 InferenceEngine::CNNNetwork cnn = InferenceEngine::CNNNetwork(
 InferenceEngine::Builder::convertToICNNNetwork(builder.build()));

-InferenceEngine::TargetDevice device = InferenceEngine::TargetDevice::eMYRIAD;
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 InferenceEngine::InferenceEnginePluginPtr enginePtr;
 {
 AutoLock lock(getInitializationMutex());
 auto& sharedPlugins = getSharedPlugins();
-auto pluginIt = sharedPlugins.find(device);
+auto pluginIt = sharedPlugins.find("MYRIAD");
 if (pluginIt != sharedPlugins.end()) {
 enginePtr = pluginIt->second;
 } else {
 auto dispatcher = InferenceEngine::PluginDispatcher({""});
-enginePtr = dispatcher.getSuitablePlugin(device);
-sharedPlugins[device] = enginePtr;
+enginePtr = dispatcher.getPluginByDevice("MYRIAD");
+sharedPlugins["MYRIAD"] = enginePtr;
 }
 }
 auto plugin = InferenceEngine::InferencePlugin(enginePtr);
 try
 {
 auto netExec = plugin.LoadNetwork(cnn, {{"VPU_PLATFORM", "VPU_2480"}});
+#else
+try
+{
+auto netExec = getCore().LoadNetwork(cnn, "MYRIAD", {{"VPU_PLATFORM", "VPU_2480"}});
+#endif
 auto infRequest = netExec.CreateInferRequest();
 } catch(...) {
 return false;
@@ -388,38 +397,41 @@ static bool detectMyriadX_()
 }
 #endif // !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)

-void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
+void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
 {
 CV_Assert(!isInitialized());

 try
 {
 AutoLock lock(getInitializationMutex());
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 auto& sharedPlugins = getSharedPlugins();
-auto pluginIt = sharedPlugins.find(targetDevice);
+auto pluginIt = sharedPlugins.find(device_name);
 if (pluginIt != sharedPlugins.end())
 {
 enginePtr = pluginIt->second;
 }
 else
+#endif
 {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 auto dispatcher = InferenceEngine::PluginDispatcher({""});
-if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
+if (device_name == "FPGA")
 enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
 else
-enginePtr = dispatcher.getSuitablePlugin(targetDevice);
-sharedPlugins[targetDevice] = enginePtr;
-
+enginePtr = dispatcher.getPluginByDevice(device_name);
+sharedPlugins[device_name] = enginePtr;
+#else
+isInit = true;
+#endif
 std::vector<std::string> candidates;

 std::string param_pluginPath = utils::getConfigurationParameterString("OPENCV_DNN_IE_EXTRA_PLUGIN_PATH", "");
 if (!param_pluginPath.empty())
 {
 candidates.push_back(param_pluginPath);
 }

-if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
-targetDevice == InferenceEngine::TargetDevice::eFPGA)
+if (device_name == "CPU" || device_name == "FPGA")
 {
 std::string suffixes[] = {"_avx2", "_sse4", ""};
 bool haveFeature[] = {
|
||||
{
|
||||
InferenceEngine::IExtensionPtr extension =
|
||||
InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
|
||||
|
||||
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
|
||||
enginePtr->AddExtension(extension, 0);
|
||||
#else
|
||||
getCore().AddExtension(extension, "CPU");
|
||||
#endif
|
||||
CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
|
||||
found = true;
|
||||
break;
|
||||
@ -463,14 +480,24 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
|
||||
// Some of networks can work without a library of extra layers.
|
||||
#ifndef _WIN32
|
||||
// Limit the number of CPU threads.
|
||||
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
|
||||
enginePtr->SetConfig({{
|
||||
InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
|
||||
}}, 0);
|
||||
#else
|
||||
if (device_name == "CPU")
|
||||
getCore().SetConfig({{
|
||||
InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
|
||||
}}, device_name);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
|
||||
plugin = InferenceEngine::InferencePlugin(enginePtr);
|
||||
|
||||
netExec = plugin.LoadNetwork(net, {});
|
||||
#else
|
||||
netExec = getCore().LoadNetwork(net, device_name);
|
||||
#endif
|
||||
}
|
||||
catch (const std::exception& ex)
|
||||
{
|
||||
@ -480,7 +507,11 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
|
||||
|
||||
bool InfEngineBackendNet::isInitialized()
|
||||
{
|
||||
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
|
||||
return (bool)enginePtr;
|
||||
#else
|
||||
return isInit;
|
||||
#endif
|
||||
}
|
||||
|
||||
void InfEngineBackendNet::addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs)
|
||||
@ -488,7 +519,7 @@ void InfEngineBackendNet::addBlobs(const std::vector<cv::Ptr<BackendWrapper> >&
|
||||
auto wrappers = infEngineWrappers(ptrs);
|
||||
for (const auto& wrapper : wrappers)
|
||||
{
|
||||
std::string name = wrapper->dataPtr->name;
|
||||
std::string name = wrapper->dataPtr->getName();
|
||||
name = name.empty() ? kDefaultInpLayerName : name;
|
||||
allBlobs.insert({name, wrapper->blob});
|
||||
}
|
||||
@ -503,7 +534,7 @@ void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Pt
|
||||
for (int i = 0; i < outs.size(); ++i)
|
||||
{
|
||||
outs[i]->futureMat = outProms[i].getArrayResult();
|
||||
outsNames[i] = outs[i]->dataPtr->name;
|
||||
outsNames[i] = outs[i]->dataPtr->getName();
|
||||
}
|
||||
}
|
||||
|
||||
@ -627,11 +658,12 @@ void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBl
|
||||
Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
|
||||
{
|
||||
// NOTE: Inference Engine sizes are reversed.
|
||||
std::vector<size_t> dims = blob->dims();
|
||||
std::vector<int> size(dims.rbegin(), dims.rend());
|
||||
std::vector<size_t> dims = blob->getTensorDesc().getDims();
|
||||
std::vector<int> size(dims.begin(), dims.end());
|
||||
auto precision = blob->getTensorDesc().getPrecision();
|
||||
|
||||
int type = -1;
|
||||
switch (blob->precision())
|
||||
switch (precision)
|
||||
{
|
||||
case InferenceEngine::Precision::FP32: type = CV_32F; break;
|
||||
case InferenceEngine::Precision::U8: type = CV_8U; break;
|
||||
@ -685,7 +717,10 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArra
|
||||
|
||||
InferenceEngine::Blob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob)
|
||||
{
|
||||
auto halfs = InferenceEngine::make_shared_blob<int16_t>(InferenceEngine::Precision::FP16, blob->layout(), blob->dims());
|
||||
auto halfs = InferenceEngine::make_shared_blob<int16_t>({
|
||||
InferenceEngine::Precision::FP16, blob->getTensorDesc().getDims(),
|
||||
blob->getTensorDesc().getLayout()
|
||||
});
|
||||
halfs->allocate();
|
||||
Mat floatsData(1, blob->size(), CV_32F, blob->buffer());
|
||||
Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer());
|
||||
@ -732,7 +767,11 @@ void resetMyriadDevice()
|
||||
{
|
||||
#ifdef HAVE_INF_ENGINE
|
||||
AutoLock lock(getInitializationMutex());
|
||||
getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD);
|
||||
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
|
||||
getSharedPlugins().erase("MYRIAD");
|
||||
#else
|
||||
getCore().UnregisterPlugin("MYRIAD");
|
||||
#endif
|
||||
#endif // HAVE_INF_ENGINE
|
||||
}
|
||||
|
||||
|
@@ -92,18 +92,22 @@ public:
 void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
 bool isAsync);

-void initPlugin(InferenceEngine::ICNNNetwork& net);
+void initPlugin(InferenceEngine::CNNNetwork& net);

 void addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs);

 private:
 InferenceEngine::Builder::Network netBuilder;

-InferenceEngine::InferenceEnginePluginPtr enginePtr;
-InferenceEngine::InferencePlugin plugin;
 InferenceEngine::ExecutableNetwork netExec;
 InferenceEngine::BlobMap allBlobs;
-InferenceEngine::TargetDevice targetDevice;
+std::string device_name;
+#if INF_ENGINE_VER_MAJOR_LE(2019010000)
+InferenceEngine::InferenceEnginePluginPtr enginePtr;
+InferenceEngine::InferencePlugin plugin;
+#else
+bool isInit = false;
+#endif

 struct InfEngineReqWrapper
 {
@@ -136,13 +136,10 @@ static const std::vector<std::string> getOpenVINOTestModelsList()

 static inline void genData(const std::vector<size_t>& dims, Mat& m, Blob::Ptr& dataPtr)
 {
-std::vector<int> reversedDims(dims.begin(), dims.end());
-std::reverse(reversedDims.begin(), reversedDims.end());
-
-m.create(reversedDims, CV_32F);
+m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
 randu(m, -1, 1);

-dataPtr = make_shared_blob<float>(Precision::FP32, dims, (float*)m.data);
+dataPtr = make_shared_blob<float>({Precision::FP32, dims, Layout::ANY}, (float*)m.data);
 }

 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
@@ -154,32 +151,42 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath

 CNNNetwork net = reader.getNetwork();

+std::string device_name;
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+Core ie;
+#else
 InferenceEnginePluginPtr enginePtr;
 InferencePlugin plugin;
+#endif
 ExecutableNetwork netExec;
 InferRequest infRequest;

 try
 {
-auto dispatcher = InferenceEngine::PluginDispatcher({""});
 switch (target)
 {
 case DNN_TARGET_CPU:
-enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
+device_name = "CPU";
 break;
 case DNN_TARGET_OPENCL:
 case DNN_TARGET_OPENCL_FP16:
-enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
+device_name = "GPU";
 break;
 case DNN_TARGET_MYRIAD:
-enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
+device_name = "MYRIAD";
 break;
 case DNN_TARGET_FPGA:
-enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+device_name = "FPGA";
 break;
 default:
 CV_Error(Error::StsNotImplemented, "Unknown target");
 };

+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+auto dispatcher = InferenceEngine::PluginDispatcher({""});
+enginePtr = dispatcher.getPluginByDevice(device_name);
+#endif
 if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
 {
 std::string suffixes[] = {"_avx2", "_sse4", ""};
|
||||
try
|
||||
{
|
||||
IExtensionPtr extension = make_so_pointer<IExtension>(libName);
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
|
||||
ie.AddExtension(extension, device_name);
|
||||
#else
|
||||
enginePtr->AddExtension(extension, 0);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
catch(...) {}
|
||||
}
|
||||
// Some of networks can work without a library of extra layers.
|
||||
}
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
|
||||
netExec = ie.LoadNetwork(net, device_name);
|
||||
#else
|
||||
plugin = InferencePlugin(enginePtr);
|
||||
|
||||
netExec = plugin.LoadNetwork(net, {});
|
||||
#endif
|
||||
infRequest = netExec.CreateInferRequest();
|
||||
}
|
||||
catch (const std::exception& ex)
|
||||
@ -224,7 +238,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
|
||||
BlobMap inputBlobs;
|
||||
for (auto& it : net.getInputsInfo())
|
||||
{
|
||||
genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]);
|
||||
genData(it.second->getTensorDesc().getDims(), inputsMap[it.first], inputBlobs[it.first]);
|
||||
}
|
||||
infRequest.SetInput(inputBlobs);
|
||||
|
||||
@ -233,7 +247,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
|
||||
BlobMap outputBlobs;
|
||||
for (auto& it : net.getOutputsInfo())
|
||||
{
|
||||
genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]);
|
||||
genData(it.second->getTensorDesc().getDims(), outputsMap[it.first], outputBlobs[it.first]);
|
||||
}
|
||||
infRequest.SetOutput(outputBlobs);
|
||||
|
||||
|
@@ -469,6 +469,42 @@ INSTANTIATE_TEST_CASE_P(/**/, Async, Combine(
 Values(CV_32F, CV_8U),
 testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
 ));

+typedef testing::TestWithParam<Target> Test_Model_Optimizer;
+TEST_P(Test_Model_Optimizer, forward_two_nets)
+{
+const int target = GetParam();
+
+const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+Net net0 = readNet(model, proto);
+net0.setPreferableTarget(target);
+
+Net net1 = readNet(model, proto);
+net1.setPreferableTarget(target);
+
+// Generate inputs.
+int blobSize[] = {2, 6, 75, 113};
+Mat input(4, &blobSize[0], CV_32F);
+randu(input, 0, 255);
+
+net0.setInput(input);
+Mat ref0 = net0.forward().clone();
+
+net1.setInput(input);
+Mat ref1 = net1.forward();
+
+net0.setInput(input);
+Mat ref2 = net0.forward();
+
+normAssert(ref0, ref2, 0, 0);
+}
+INSTANTIATE_TEST_CASE_P(/**/, Test_Model_Optimizer,
+testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
+);
+
 #endif // HAVE_INF_ENGINE

 }} // namespace
@@ -357,11 +357,9 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 #if defined(INF_ENGINE_RELEASE)
 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
 {
-#if INF_ENGINE_VER_MAJOR_EQ(2019010000)
+#if INF_ENGINE_VER_MAJOR_GE(2019020000)
 if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
 applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
 #else
 applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
 }
 #endif
@@ -395,16 +393,10 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 {
 applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
-#if defined(INF_ENGINE_RELEASE)
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-{
-#if INF_ENGINE_VER_MAJOR_LE(2019010000)
-if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-#endif
-}
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
 #endif

 checkBackend();
@@ -456,12 +448,13 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
 float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;

 #if defined(INF_ENGINE_RELEASE)
-if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-&& getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-)
+if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
 {
 scoreDiff = 0.061;
 iouDiff = 0.12;
 detectionConfThresh = 0.36;
 }
 #endif
 normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff);
 expectNoFallbacksFromIE(net);
@@ -262,7 +262,7 @@ class Test_Torch_nets : public DNNTestLayer {};

 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
 applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
@@ -287,8 +287,8 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)

 // Reference output values are in range [-0.17212, 0.263492]
 // on Myriad problem layer: l4_Pooling - does not use pads_begin
-float l1 = (target == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
-float lInf = (target == DNN_TARGET_OPENCL_FP16) ? 1.5e-3 : 1e-3;
+float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-3 : 1e-5;
+float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : 1e-3;
 Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true);
 normAssert(out, outRef, "", l1, lInf);
 }
@@ -98,7 +98,7 @@ core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bit
 'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \
 'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \
 'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \
-'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'setIdentity', 'setRNGSeed', \
+'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \
 'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'],
 'Algorithm': []}
@@ -941,4 +941,22 @@ QUnit.test('test_filter', function(assert) {
 inv3.delete();
 inv4.delete();
 }
+//Rotate
+{
+let dst = new cv.Mat();
+let src = cv.matFromArray(3, 2, cv.CV_8U, [1,2,3,4,5,6]);
+
+cv.rotate(src, dst, cv.ROTATE_90_CLOCKWISE);
+
+size = dst.size();
+assert.equal(size.height, 2, "ROTATE_HEIGHT");
+assert.equal(size.width, 3, "ROTATE_WIGTH");
+
+let expected = new Uint8Array([5,3,1,6,4,2]);
+
+assert.deepEqual(dst.data, expected);
+
+dst.delete();
+src.delete();
+}
 });
@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include "cascadedetect.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_objdetect.hpp"

 #include <cstdio>
|
||||
c.signedGradient = signedGradient;
|
||||
}
|
||||
|
||||
#if CV_NEON
|
||||
// replace of _mm_set_ps
|
||||
inline float32x4_t vsetq_f32(float f0, float f1, float f2, float f3)
|
||||
{
|
||||
float32x4_t a = vdupq_n_f32(f0);
|
||||
a = vsetq_lane_f32(f1, a, 1);
|
||||
a = vsetq_lane_f32(f2, a, 2);
|
||||
a = vsetq_lane_f32(f3, a, 3);
|
||||
return a;
|
||||
}
|
||||
#endif
|
||||
void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
|
||||
Size paddingTL, Size paddingBR) const
|
||||
{
|
||||
@@ -259,38 +249,22 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp

 Mat_<float> _lut(1, 256);
 const float* const lut = &_lut(0,0);
-#if CV_SSE2
-const int indices[] = { 0, 1, 2, 3 };
-__m128i idx = _mm_loadu_si128((const __m128i*)indices);
-__m128i ifour = _mm_set1_epi32(4);
+#if CV_SIMD128
+v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
+v_float32x4 ifour = v_setall_f32(4.0);

 float* const _data = &_lut(0, 0);
-if( gammaCorrection )
-for( i = 0; i < 256; i += 4 )
+if ( gammaCorrection )
+for ( i = 0; i < 256; i += 4)
 {
-_mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
-idx = _mm_add_epi32(idx, ifour);
+v_store(_data + i, v_sqrt(idx));
+idx += ifour;
 }
 else
-for( i = 0; i < 256; i += 4 )
+for ( i = 0; i < 256; i += 4)
 {
-_mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
-idx = _mm_add_epi32(idx, ifour);
-}
-#elif CV_NEON
-const int indices[] = { 0, 1, 2, 3 };
-uint32x4_t idx = *(uint32x4_t*)indices;
-uint32x4_t ifour = vdupq_n_u32(4);
-
-float* const _data = &_lut(0, 0);
-if( gammaCorrection )
-for( i = 0; i < 256; i++ )
-_lut(0,i) = std::sqrt((float)i);
-else
-for( i = 0; i < 256; i += 4 )
-{
-vst1q_f32(_data + i, vcvtq_f32_u32(idx));
-idx = vaddq_u32 (idx, ifour);
+v_store(_data + i, idx);
+idx += ifour;
 }
 #else
 if( gammaCorrection )
@@ -327,17 +301,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
 {
 int end = gradsize.width + 2;
 xmap -= 1, x = 0;
-#if CV_SSE2
+#if CV_SIMD128
 for ( ; x <= end - 4; x += 4)
 {
-__m128i mul_res = _mm_loadu_si128((const __m128i*)(xmap + x));
-mul_res = _mm_add_epi32(_mm_add_epi32(mul_res, mul_res), mul_res); // multiply by 3
-_mm_storeu_si128((__m128i*)(xmap + x), mul_res);
+v_int32x4 mul_res = v_load(xmap + x);
+mul_res += mul_res + mul_res;
+v_store(xmap + x, mul_res);
 }
-#elif CV_NEON
-int32x4_t ithree = vdupq_n_s32(3);
-for ( ; x <= end - 4; x += 4)
-vst1q_s32(xmap + x, vmulq_s32(ithree, vld1q_s32(xmap + x)));
 #endif
 for ( ; x < end; ++x)
 xmap[x] *= 3;
@@ -368,7 +338,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
 else
 {
 x = 0;
-#if CV_SSE2
+#if CV_SIMD128
 for( ; x <= width - 4; x += 4 )
 {
 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
@@ -378,73 +348,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
 T p22 = imgPtr + xmap[x+3], p20 = p02;
 T p32 = imgPtr + xmap[x+4], p30 = p12;

-__m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
-_mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
-__m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
-_mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
-__m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
-_mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));
+v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
+v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
+v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
+v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
+v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
+v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);

-__m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
-_mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
-__m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
-_mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
-__m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
-_mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));
+v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
+v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
+v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
+v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
+v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
+v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);

-__m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
-__m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
-__m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));
+v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
+v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
+v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);

-__m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
-_dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
-_dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));
+v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
+_dx2 = v_select(mask, _dx2, _dx1);
+_dy2 = v_select(mask, _dy2, _dy1);

-mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
-_dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
-_dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));
+mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
+_dx2 = v_select(mask, _dx2, _dx0);
+_dy2 = v_select(mask, _dy2, _dy0);

-_mm_storeu_ps(dbuf + x, _dx2);
-_mm_storeu_ps(dbuf + x + width, _dy2);
-}
-#elif CV_NEON
-for( ; x <= width - 4; x += 4 )
-{
-int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
-typedef const uchar* const T;
-T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
-T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
-T p22 = imgPtr + xmap[x+3], p20 = p02;
-T p32 = imgPtr + xmap[x+4], p30 = p12;
-
-float32x4_t _dx0 = vsubq_f32(vsetq_f32(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]),
-vsetq_f32(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]));
-float32x4_t _dx1 = vsubq_f32(vsetq_f32(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]),
-vsetq_f32(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]));
-float32x4_t _dx2 = vsubq_f32(vsetq_f32(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]),
-vsetq_f32(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]));
-
-float32x4_t _dy0 = vsubq_f32(vsetq_f32(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]),
-vsetq_f32(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]));
-float32x4_t _dy1 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]),
-vsetq_f32(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]));
-float32x4_t _dy2 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]),
-vsetq_f32(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]));
-
-float32x4_t _mag0 = vaddq_f32(vmulq_f32(_dx0, _dx0), vmulq_f32(_dy0, _dy0));
-float32x4_t _mag1 = vaddq_f32(vmulq_f32(_dx1, _dx1), vmulq_f32(_dy1, _dy1));
-float32x4_t _mag2 = vaddq_f32(vmulq_f32(_dx2, _dx2), vmulq_f32(_dy2, _dy2));
-
-uint32x4_t mask = vcgtq_f32(_mag2, _mag1);
-_dx2 = vbslq_f32(mask, _dx2, _dx1);
-_dy2 = vbslq_f32(mask, _dy2, _dy1);
-
-mask = vcgtq_f32(vmaxq_f32(_mag2, _mag1), _mag0);
-_dx2 = vbslq_f32(mask, _dx2, _dx0);
-_dy2 = vbslq_f32(mask, _dy2, _dy0);
-
-vst1q_f32(dbuf + x, _dx2);
-vst1q_f32(dbuf + x + width, _dy2);
+v_store(dbuf + x, _dx2);
+v_store(dbuf + x + width, _dy2);
 }
 #endif
 for( ; x < width; x++ )
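Note: the and/andnot/or triple deleted above is the classic bitwise-select idiom, (mask & a) | (~mask & b) with an all-ones or all-zeros mask per lane; v_select expresses it in one portable call. A plain-C++ sketch of a single lane:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static float select_lane(bool cond, float a, float b)
    {
        uint32_t ua, ub, ur;
        std::memcpy(&ua, &a, 4);
        std::memcpy(&ub, &b, 4);
        const uint32_t mask = cond ? 0xFFFFFFFFu : 0u;  // lane-wide compare result
        ur = (mask & ua) | (~mask & ub);                // bitwise select
        float r;
        std::memcpy(&r, &ur, 4);
        return r;
    }

    int main()
    {
        assert(select_lane(true, 1.5f, 2.5f) == 1.5f);
        assert(select_lane(false, 1.5f, 2.5f) == 2.5f);
        return 0;
    }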
@@ -488,44 +419,40 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp

 // filling the result matrix
 x = 0;
-#if CV_SSE2
-__m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps();
-__m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f);
-__m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128();
+#if CV_SIMD128
+v_float32x4 fhalf = v_setall_f32(0.5f);
+v_float32x4 _angleScale = v_setall_f32(angleScale), fone = v_setall_f32(1.0f);
+v_int32x4 ione = v_setall_s32(1), _nbins = v_setall_s32(nbins), izero = v_setzero_s32();

 for ( ; x <= width - 4; x += 4)
 {
 int x2 = x << 1;
-__m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1));
-__m128 _angle = _mm_loadu_ps(dbuf + x + width * 3);
-_angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf);
+v_float32x4 _mag = v_load(dbuf + x + (width << 1));
+v_float32x4 _angle = v_load(dbuf + x + width * 3);
+_angle = (_angleScale * _angle) - fhalf;

-__m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero));
-__m128i _hidx = _mm_cvttps_epi32(_angle);
-_hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
-_angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx));
+v_int32x4 _hidx = v_floor(_angle);
+_angle -= v_cvt_f32(_hidx);

-__m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
-__m128 ft1 = _mm_mul_ps(_mag, _angle);
-__m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
-__m128 ft3 = _mm_unpackhi_ps(ft0, ft1);
+v_float32x4 ft0 = _mag * (fone - _angle);
+v_float32x4 ft1 = _mag * _angle;

-_mm_storeu_ps(gradPtr + x2, ft2);
-_mm_storeu_ps(gradPtr + x2 + 4, ft3);
+v_store_interleave(gradPtr + x2, ft0, ft1);

-__m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
-__m128i it0 = _mm_and_si128(mask0, _nbins);
-mask0 = _mm_cmplt_epi32(_hidx, _nbins);
-__m128i it1 = _mm_andnot_si128(mask0, _nbins);
-_hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1));
+v_int32x4 mask0 = _hidx >> 31;
+v_int32x4 it0 = mask0 & _nbins;
+mask0 = (_hidx >= _nbins);
+v_int32x4 it1 = mask0 & _nbins;
+_hidx += (it0 - it1);

-it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-_hidx = _mm_add_epi32(ione, _hidx);
-_hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins));
-it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-it0 = _mm_unpacklo_epi8(it0, it1);
+it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+_hidx += ione;
+_hidx &= (_hidx < _nbins);
+it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+v_uint8x16 it2, it3;
+v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);

-_mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
+v_store_low(qanglePtr + x2, it2);
 }
 #endif
 for( ; x < width; x++ )
@ -665,31 +592,17 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
|
||||
float bh = blockSize.height * 0.5f, bw = blockSize.width * 0.5f;
|
||||
|
||||
i = 0;
|
||||
#if CV_SSE2
|
||||
const int a[] = { 0, 1, 2, 3 };
|
||||
__m128i idx = _mm_loadu_si128((__m128i*)a);
|
||||
__m128 _bw = _mm_set1_ps(bw), _bh = _mm_set1_ps(bh);
|
||||
__m128i ifour = _mm_set1_epi32(4);
|
||||
#if CV_SIMD128
|
||||
v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
|
||||
v_float32x4 _bw = v_setall_f32(bw), _bh = v_setall_f32(bh);
|
||||
v_float32x4 ifour = v_setall_f32(4.0);
|
||||
|
||||
for (; i <= blockSize.height - 4; i += 4)
|
||||
{
|
||||
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bh);
|
||||
t = _mm_mul_ps(t, t);
|
||||
idx = _mm_add_epi32(idx, ifour);
|
||||
_mm_storeu_ps(_di + i, t);
|
||||
}
|
||||
#elif CV_NEON
|
||||
const int a[] = { 0, 1, 2, 3 };
|
||||
int32x4_t idx = vld1q_s32(a);
|
||||
float32x4_t _bw = vdupq_n_f32(bw), _bh = vdupq_n_f32(bh);
|
||||
int32x4_t ifour = vdupq_n_s32(4);
|
||||
|
||||
for (; i <= blockSize.height - 4; i += 4)
|
||||
{
|
||||
float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bh);
|
||||
t = vmulq_f32(t, t);
|
||||
idx = vaddq_s32(idx, ifour);
|
||||
vst1q_f32(_di + i, t);
|
||||
v_float32x4 t = idx - _bh;
|
||||
t *= t;
|
||||
idx += ifour;
|
||||
v_store(_di + i, t);
|
||||
}
|
||||
#endif
|
||||
for ( ; i < blockSize.height; ++i)
|
||||
@ -699,23 +612,15 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
|
||||
}
|
||||
|
||||
j = 0;
|
||||
#if CV_SSE2
|
||||
idx = _mm_loadu_si128((__m128i*)a);
|
||||
for (; j <= blockSize.width - 4; j += 4)
|
||||
#if CV_SIMD128
|
||||
idx = v_float32x4(0.0f, 1.0f, 2.0f, 3.0f);
|
||||
|
||||
for (; j <= blockSize.height - 4; j += 4)
|
||||
{
|
||||
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bw);
|
||||
t = _mm_mul_ps(t, t);
|
||||
idx = _mm_add_epi32(idx, ifour);
|
||||
_mm_storeu_ps(_dj + j, t);
|
||||
}
|
||||
#elif CV_NEON
|
||||
idx = vld1q_s32(a);
|
||||
for (; j <= blockSize.width - 4; j += 4)
|
||||
{
|
||||
float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bw);
|
||||
t = vmulq_f32(t, t);
|
||||
idx = vaddq_s32(idx, ifour);
|
||||
vst1q_f32(_dj + j, t);
|
||||
v_float32x4 t = idx - _bw;
|
||||
t *= t;
|
||||
idx += ifour;
|
||||
v_store(_dj + j, t);
|
||||
}
|
||||
#endif
|
||||
for ( ; j < blockSize.width; ++j)
|
||||
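
Note: the new j loop bound is written as blockSize.width above (the flattened diff showed blockSize.height, which is inconsistent with the _dj table it fills; the two are equal for the default square block, so the slip is benign). The _di/_dj tables hold squared row/column distances from the block centre. A sketch of how such tables are typically combined into the Gaussian block window, assuming the usual separable exp(-(di + dj) * scale) form; the function itself is hypothetical:

#include <cmath>
#include <vector>

// Combine squared row/column distances into a Gaussian block window,
// with scale = 1 / (2 * sigma^2).
static std::vector<float> gaussianBlockWeights(int blockH, int blockW, float sigma)
{
    const float bh = blockH * 0.5f, bw = blockW * 0.5f;
    const float scale = 1.f / (sigma * sigma * 2.f);
    std::vector<float> di(blockH), dj(blockW), w(blockH * blockW);
    for (int i = 0; i < blockH; ++i)
        di[i] = (i - bh) * (i - bh);             // what the first loop stores in _di
    for (int j = 0; j < blockW; ++j)
        dj[j] = (j - bw) * (j - bw);             // what the second loop stores in _dj
    for (int i = 0; i < blockH; ++i)
        for (int j = 0; j < blockW; ++j)
            w[i * blockW + j] = std::exp(-(di[i] + dj[j]) * scale);
    return w;
}
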
@ -913,7 +818,7 @@ const float* HOGCache::getBlock(Point pt, float* buf)
hist[h0] = t0; hist[h1] = t1;
}

#if CV_SSE2
#if CV_SIMD128
float hist0[4], hist1[4];
for( ; k < C2; k++ )
{
@ -922,12 +827,12 @@ const float* HOGCache::getBlock(Point pt, float* buf)
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];

__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;

_mm_storeu_ps(hist0, _t0);
_mm_storeu_ps(hist1, _t1);
v_store(hist0, _t0);
v_store(hist1, _t1);

float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0];
@ -939,31 +844,6 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t1 = hist[h1] + hist1[1];
hist[h0] = t0; hist[h1] = t1;
}
#elif CV_NEON
float hist0[4], hist1[4];
for( ; k < C2; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];

float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));

float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], (blockHist + pk.histOfs[1])[h0], 0, 0);
float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1], (blockHist + pk.histOfs[1])[h1], 0, 0);

float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
vst1q_f32(hist0, _t0);
vst1q_f32(hist1, _t1);

(blockHist + pk.histOfs[0])[h0] = hist0[0];
(blockHist + pk.histOfs[1])[h0] = hist0[1];

(blockHist + pk.histOfs[0])[h1] = hist1[0];
(blockHist + pk.histOfs[1])[h1] = hist1[1];
}
#else
for( ; k < C2; k++ )
{
@ -987,7 +867,7 @@ const float* HOGCache::getBlock(Point pt, float* buf)
}
#endif

#if CV_SSE2
#if CV_SIMD128
for( ; k < C4; k++ )
{
const PixData& pk = _pixData[k];
@ -995,12 +875,12 @@ const float* HOGCache::getBlock(Point pt, float* buf)
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];

__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);
v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;

_mm_storeu_ps(hist0, _t0);
_mm_storeu_ps(hist1, _t1);
v_store(hist0, _t0);
v_store(hist1, _t1);

float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0];
@ -1021,62 +901,6 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t0 = hist[h0] + hist0[3];
t1 = hist[h1] + hist1[3];
hist[h0] = t0; hist[h1] = t1;

// __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0],
// (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]);
// __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1],
// (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]);
//
// _hist0 = _mm_add_ps(_t0, _hist0);
// _hist1 = _mm_add_ps(_t1, _hist1);
//
// _mm_storeu_ps(hist0, _hist0);
// _mm_storeu_ps(hist1, _hist1);
//
// (pk.histOfs[0] + blockHist)[h0] = hist0[0];
// (pk.histOfs[1] + blockHist)[h0] = hist0[1];
// (pk.histOfs[2] + blockHist)[h0] = hist0[2];
// (pk.histOfs[3] + blockHist)[h0] = hist0[3];
//
// (pk.histOfs[0] + blockHist)[h1] = hist1[0];
// (pk.histOfs[1] + blockHist)[h1] = hist1[1];
// (pk.histOfs[2] + blockHist)[h1] = hist1[2];
// (pk.histOfs[3] + blockHist)[h1] = hist1[3];
}
#elif CV_NEON
for( ; k < C4; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];

float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));

float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0],
(blockHist + pk.histOfs[1])[h0],
(blockHist + pk.histOfs[2])[h0],
(blockHist + pk.histOfs[3])[h0]);
float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1],
(blockHist + pk.histOfs[1])[h1],
(blockHist + pk.histOfs[2])[h1],
(blockHist + pk.histOfs[3])[h1]);


float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
vst1q_f32(hist0, _t0);
vst1q_f32(hist1, _t1);

(blockHist + pk.histOfs[0])[h0] = hist0[0];
(blockHist + pk.histOfs[1])[h0] = hist0[1];
(blockHist + pk.histOfs[2])[h0] = hist0[2];
(blockHist + pk.histOfs[3])[h0] = hist0[3];

(blockHist + pk.histOfs[0])[h1] = hist1[0];
(blockHist + pk.histOfs[1])[h1] = hist1[1];
(blockHist + pk.histOfs[2])[h1] = hist1[2];
(blockHist + pk.histOfs[3])[h1] = hist1[3];
}
#else
for( ; k < C4; k++ )
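
Note: in the C4 loop above each pixel contributes to four neighbouring cell histograms at two orientation bins, with precomputed spatial weights. A scalar sketch of one such vote; the struct is a simplified stand-in for the PixData entries used above:

// Simplified stand-in for PixData: one pixel's vote into the block histogram.
struct PixVote
{
    int   histOfs[4];     // offsets of up to four neighbouring cell histograms
    float histWeights[4]; // bilinear spatial weights for those cells
    float gradWeight;     // Gaussian block weight for this pixel
};

// Scalar form of the C4 accumulation: two orientation bins, four cells each.
static void accumulateVote(float* blockHist, const PixVote& pk,
                           const float grad[2], const unsigned char qangle[2])
{
    for (int c = 0; c < 4; ++c)
    {
        float* hist = blockHist + pk.histOfs[c];
        const float w = pk.gradWeight * pk.histWeights[c];
        hist[qangle[0]] += grad[0] * w;   // vote for the first bin
        hist[qangle[1]] += grad[1] * w;   // vote for the second bin
    }
}
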
@ -1123,26 +947,16 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
float* hist = &_hist[0], sum = 0.0f, partSum[4];
size_t i = 0, sz = blockHistogramSize;

#if CV_SSE2
__m128 p0 = _mm_loadu_ps(hist);
__m128 s = _mm_mul_ps(p0, p0);
#if CV_SIMD128
v_float32x4 p0 = v_load(hist);
v_float32x4 s = p0 * p0;

for (i = 4; i <= sz - 4; i += 4)
{
p0 = _mm_loadu_ps(hist + i);
s = _mm_add_ps(s, _mm_mul_ps(p0, p0));
p0 = v_load(hist + i);
s += p0 * p0;
}
_mm_storeu_ps(partSum, s);
#elif CV_NEON
float32x4_t p0 = vld1q_f32(hist);
float32x4_t s = vmulq_f32(p0, p0);

for (i = 4; i <= sz - 4; i += 4)
{
p0 = vld1q_f32(hist + i);
s = vaddq_f32(s, vmulq_f32(p0, p0));
}
vst1q_f32(partSum, s);
v_store(partSum, s);
#else
partSum[0] = 0.0f;
partSum[1] = 0.0f;
@ -1165,44 +979,25 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
i = 0, sum = 0.0f;

#if CV_SSE2
__m128 _scale = _mm_set1_ps(scale);
static __m128 _threshold = _mm_set1_ps(thresh);
#if CV_SIMD128
v_float32x4 _scale = v_setall_f32(scale);
static v_float32x4 _threshold = v_setall_f32(thresh);

__m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist));
p = _mm_min_ps(p, _threshold);
s = _mm_mul_ps(p, p);
_mm_storeu_ps(hist, p);
v_float32x4 p = _scale * v_load(hist);
p = v_min(p, _threshold);
s = p * p;
v_store(hist, p);

for(i = 4 ; i <= sz - 4; i += 4)
{
p = _mm_loadu_ps(hist + i);
p = _mm_mul_ps(p, _scale);
p = _mm_min_ps(p, _threshold);
s = _mm_add_ps(s, _mm_mul_ps(p, p));
_mm_storeu_ps(hist + i, p);
p = v_load(hist + i);
p *= _scale;
p = v_min(p, _threshold);
s += p * p;
v_store(hist + i, p);
}

_mm_storeu_ps(partSum, s);
#elif CV_NEON
float32x4_t _scale = vdupq_n_f32(scale);
static float32x4_t _threshold = vdupq_n_f32(thresh);

float32x4_t p = vmulq_f32(_scale, vld1q_f32(hist));
p = vminq_f32(p, _threshold);
s = vmulq_f32(p, p);
vst1q_f32(hist, p);

for(i = 4 ; i <= sz - 4; i += 4)
{
p = vld1q_f32(hist + i);
p = vmulq_f32(p, _scale);
p = vminq_f32(p, _threshold);
s = vaddq_f32(s, vmulq_f32(p, p));
vst1q_f32(hist + i, p);
}

vst1q_f32(partSum, s);
v_store(partSum, s);
#else
partSum[0] = 0.0f;
partSum[1] = 0.0f;
@ -1230,19 +1025,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
}

scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
#if CV_SSE2
__m128 _scale2 = _mm_set1_ps(scale);
#if CV_SIMD128
v_float32x4 _scale2 = v_setall_f32(scale);
for ( ; i <= sz - 4; i += 4)
{
__m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
_mm_storeu_ps(hist + i, t);
}
#elif CV_NEON
float32x4_t _scale2 = vdupq_n_f32(scale);
for ( ; i <= sz - 4; i += 4)
{
float32x4_t t = vmulq_f32(_scale2, vld1q_f32(hist + i));
vst1q_f32(hist + i, t);
v_float32x4 t = _scale2 * v_load(hist + i);
v_store(hist + i, t);
}
#endif
for ( ; i < sz; ++i)
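
Note: the three passes above implement L2-Hys block normalization. A scalar reference of the same scheme, using the same damping constants as the code above (the function itself is a sketch, not the library's; thresh corresponds to descriptor->L2HysThreshold, 0.2 by default):

#include <algorithm>
#include <cmath>
#include <cstddef>

// L2-Hys: L2-normalize, clip at thresh, then L2-normalize again.
static void l2hysNormalize(float* hist, size_t sz, float thresh)
{
    float sum = 0.f;
    for (size_t i = 0; i < sz; ++i)
        sum += hist[i] * hist[i];
    float scale = 1.f / (std::sqrt(sum) + sz * 0.1f);

    sum = 0.f;
    for (size_t i = 0; i < sz; ++i)
    {
        hist[i] = std::min(hist[i] * scale, thresh);  // clip dominant gradients
        sum += hist[i] * hist[i];
    }
    scale = 1.f / (std::sqrt(sum) + 1e-3f);           // renormalize after clipping
    for (size_t i = 0; i < sz; ++i)
        hist[i] *= scale;
}
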
@ -1690,7 +1478,7 @@ void HOGDescriptor::detect(InputArray _img,
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize);

#if CV_SSE2 || CV_NEON
#if CV_SIMD128
float partSum[4];
#endif

@ -1719,37 +1507,20 @@ void HOGDescriptor::detect(InputArray _img,
Point pt = pt0 + bj.imgOffset;

const float* vec = cache.getBlock(pt, &blockHist[0]);
#if CV_SSE2
__m128 _vec = _mm_loadu_ps(vec);
__m128 _svmVec = _mm_loadu_ps(svmVec);
__m128 sum = _mm_mul_ps(_svmVec, _vec);
#if CV_SIMD128
v_float32x4 _vec = v_load(vec);
v_float32x4 _svmVec = v_load(svmVec);
v_float32x4 sum = _svmVec * _vec;

for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = _mm_loadu_ps(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k);
_vec = v_load(vec + k);
_svmVec = v_load(svmVec + k);

sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
sum += _vec * _svmVec;
}

_mm_storeu_ps(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#elif CV_NEON
float32x4_t _vec = vld1q_f32(vec);
float32x4_t _svmVec = vld1q_f32(svmVec);
float32x4_t sum = vmulq_f32(_svmVec, _vec);

for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = vld1q_f32(vec + k);
_svmVec = vld1q_f32(svmVec + k);

sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
}

vst1q_f32(partSum, sum);
v_store(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
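
Note: the universal-intrinsics dot product above replaces separate SSE2 and NEON bodies with a single implementation. A self-contained sketch of the same pattern (the helper name is hypothetical; it assumes len >= 4 and leaves the remainder to a scalar tail, as the detector code does):

#include <opencv2/core/hal/intrin.hpp>

// Four-wide dot product in the style of the loop above.
static double dotBlock(const float* a, const float* b, int len)
{
    cv::v_float32x4 sum = cv::v_load(a) * cv::v_load(b);
    for (int k = 4; k <= len - 4; k += 4)
        sum += cv::v_load(a + k) * cv::v_load(b + k);

    float partSum[4];
    cv::v_store(partSum, sum);   // spill lanes, then reduce horizontally
    return ((double)partSum[0] + partSum[1]) + ((double)partSum[2] + partSum[3]);
}

cv::v_reduce_sum(sum) could replace the explicit partSum spill; the diff keeps the spill for parity with the removed SSE2/NEON bodies.
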
@ -3530,7 +3301,7 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize);

#if CV_SSE2 || CV_NEON
#if CV_SIMD128
float partSum[4];
#endif

@ -3557,37 +3328,21 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc

// need to divide this into 4 parts!
const float* vec = cache.getBlock(pt, &blockHist[0]);
#if CV_SSE2
__m128 _vec = _mm_loadu_ps(vec);
__m128 _svmVec = _mm_loadu_ps(svmVec);
__m128 sum = _mm_mul_ps(_svmVec, _vec);
#if CV_SIMD128
v_float32x4 _vec = v_load(vec);
v_float32x4 _svmVec = v_load(svmVec);
v_float32x4 sum = _svmVec * _vec;

for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = _mm_loadu_ps(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k);
_vec = v_load(vec + k);
_svmVec = v_load(svmVec + k);

sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
sum += _vec * _svmVec;
}

_mm_storeu_ps(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#elif CV_NEON
float32x4_t _vec = vld1q_f32(vec);
float32x4_t _svmVec = vld1q_f32(svmVec);
float32x4_t sum = vmulq_f32(_svmVec, _vec);

for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = vld1q_f32(vec + k);
_svmVec = vld1q_f32(svmVec + k);

sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
}

vst1q_f32(partSum, sum);
v_store(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
@ -1,7 +1,7 @@
set(the_description "Images stitching")

if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
endif()

set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
@ -499,7 +499,7 @@ struct CvCapture_FFMPEG

double r2d(AVRational r) const;
int64_t dts_to_frame_number(int64_t dts);
double dts_to_sec(int64_t dts);
double dts_to_sec(int64_t dts) const;

AVFormatContext * ic;
AVCodec * avcodec;
@ -892,7 +892,14 @@ bool CvCapture_FFMPEG::open( const char* _filename )
#else
av_dict_set(&dict, "rtsp_transport", "tcp", 0);
#endif
int err = avformat_open_input(&ic, _filename, NULL, &dict);
AVInputFormat* input_format = NULL;
AVDictionaryEntry* entry = av_dict_get(dict, "input_format", NULL, 0);
if (entry != 0)
{
input_format = av_find_input_format(entry->value);
}

int err = avformat_open_input(&ic, _filename, input_format, &dict);
#else
int err = av_open_input_file(&ic, _filename, NULL, 0, NULL);
#endif
@ -1168,7 +1175,11 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
switch( property_id )
{
case CAP_PROP_POS_MSEC:
return 1000.0*(double)frame_number/get_fps();
if (picture_pts == AV_NOPTS_VALUE_)
{
return 0;
}
return (dts_to_sec(picture_pts) * 1000);
case CAP_PROP_POS_FRAMES:
return (double)frame_number;
case CAP_PROP_POS_AVI_RATIO:
@ -1278,7 +1289,7 @@ int64_t CvCapture_FFMPEG::dts_to_frame_number(int64_t dts)
return (int64_t)(get_fps() * sec + 0.5);
}

double CvCapture_FFMPEG::dts_to_sec(int64_t dts)
double CvCapture_FFMPEG::dts_to_sec(int64_t dts) const
{
return (double)(dts - ic->streams[video_stream]->start_time) *
r2d(ic->streams[video_stream]->time_base);
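
Note: CAP_PROP_POS_MSEC now reports the decoded frame's timestamp via dts_to_sec() instead of frame_number/fps. The underlying arithmetic, as a standalone sketch with illustrative values:

#include <cstdint>

// Rebase a stream timestamp against start_time and scale by the stream's
// time_base (r2d(time_base) == num/den). Names are illustrative.
static double stampToSec(int64_t pts, int64_t start_time, int tb_num, int tb_den)
{
    return (double)(pts - start_time) * tb_num / tb_den;
}

// Example: pts = 180000 with start_time = 0 and a 1/90000 time_base (common
// for MPEG-TS) gives 2.0 s, so CAP_PROP_POS_MSEC would report 2000.
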
@ -796,11 +796,10 @@ bool CvCaptureCAM_V4L::open(int _index)
name = cv::format("/dev/video%d", _index);
}

/* Print the CameraNumber at the end of the string with a width of one character */
bool res = open(name.c_str());
if (!res)
{
fprintf(stderr, "VIDEOIO ERROR: V4L: can't open camera by index %d\n", _index);
CV_LOG_WARNING(NULL, cv::format("VIDEOIO ERROR: V4L: can't open camera by index %d", _index));
}
return res;
}
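
Note: the V4L hunk above swaps an unconditional fprintf(stderr, ...) for OpenCV's logging framework. A minimal usage sketch (the function is hypothetical; NULL selects the global log tag):

#include <opencv2/core.hpp>
#include <opencv2/core/utils/logger.hpp>

// Unlike fprintf(stderr, ...), the message is filtered by the configured
// log level, e.g. via cv::utils::logging::setLogLevel(...).
static void reportOpenFailure(int index)
{
    CV_LOG_WARNING(NULL, cv::format("VIDEOIO ERROR: V4L: can't open camera by index %d", index));
}
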
@ -84,7 +84,7 @@ public:
{
if (!videoio_registry::hasBackend(apiPref))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
throw SkipTestException("Unstable MSMF test");
writeVideo();
VideoCapture cap;
@ -172,7 +172,7 @@ public:
{
if (!videoio_registry::hasBackend(apiPref))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
throw SkipTestException("Unstable MSMF test");
VideoCapture cap;
EXPECT_NO_THROW(cap.open(video_file, apiPref));