mirror of
https://github.com/opencv/opencv.git
synced 2025-01-13 00:01:27 +08:00
613c12e590
CUDA backend for the DNN module * stub cuda4dnn design * minor fixes for tests and doxygen * add csl public api directory to module headers * add low-level CSL components * add high-level CSL components * integrate csl::Tensor into backbone code * switch to CPU iff unsupported; otherwise, fail on error * add fully connected layer * add softmax layer * add activation layers * support arbitary rank TensorDescriptor * pass input wrappers to `initCUDA()` * add 1d/2d/3d-convolution * add pooling layer * reorganize and refactor code * fixes for gcc, clang and doxygen; remove cxx14/17 code * add blank_layer * add LRN layer * add rounding modes for pooling layer * split tensor.hpp into tensor.hpp and tensor_ops.hpp * add concat layer * add scale layer * add batch normalization layer * split math.cu into activations.cu and math.hpp * add eltwise layer * add flatten layer * add tensor transform api * add asymmetric padding support for convolution layer * add reshape layer * fix rebase issues * add permute layer * add padding support for concat layer * refactor and reorganize code * add normalize layer * optimize bias addition in scale layer * add prior box layer * fix and optimize normalize layer * add asymmetric padding support for pooling layer * add event API * improve pooling performance for some padding scenarios * avoid over-allocation of compute resources to kernels * improve prior box performance * enable layer fusion * add const layer * add resize layer * add slice layer * add padding layer * add deconvolution layer * fix channelwise ReLU initialization * add vector traits * add vectorized versions of relu, clipped_relu, power * add vectorized concat kernels * improve concat_with_offsets performance * vectorize scale and bias kernels * add support for multi-billion element tensors * vectorize prior box kernels * fix address alignment check * improve bias addition performance of conv/deconv/fc layers * restructure code for supporting multiple targets * add 
DNN_TARGET_CUDA_FP64 * add DNN_TARGET_FP16 * improve vectorization * add region layer * improve tensor API, add dynamic ranks 1. use ManagedPtr instead of a Tensor in backend wrapper 2. add new methods to tensor classes - size_range: computes the combined size of for a given axis range - tensor span/view can be constructed from a raw pointer and shape 3. the tensor classes can change their rank at runtime (previously rank was fixed at compile-time) 4. remove device code from tensor classes (as they are unused) 5. enforce strict conditions on tensor class APIs to improve debugging ability * fix parametric relu activation * add squeeze/unsqueeze tensor API * add reorg layer * optimize permute and enable 2d permute * enable 1d and 2d slice * add split layer * add shuffle channel layer * allow tensors of different ranks in reshape primitive * patch SliceOp to allow Crop Layer * allow extra shape inputs in reshape layer * use `std::move_backward` instead of `std::move` for insert in resizable_static_array * improve workspace management * add spatial LRN * add nms (cpu) to region layer * add max pooling with argmax ( and a fix to limits.hpp) * add max unpooling layer * rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA * update supportBackend to be more rigorous * remove stray include from preventing non-cuda build * include op_cuda.hpp outside condition #if * refactoring, fixes and many optimizations * drop DNN_TARGET_CUDA_FP64 * fix gcc errors * increase max. tensor rank limit to six * add Interp layer * drop custom layers; use BackendNode * vectorize activation kernels * fixes for gcc * remove wrong assertion * fix broken assertion in unpooling primitive * fix build errors in non-CUDA build * completely remove workspace from public API * fix permute layer * enable accuracy and perf. 
tests for DNN_TARGET_CUDA * add asynchronous forward * vectorize eltwise ops * vectorize fill kernel * fixes for gcc * remove CSL headers from public API * remove csl header source group from cmake * update min. cudnn version in cmake * add numerically stable FP32 log1pexp * refactor code * add FP16 specialization to cudnn based tensor addition * vectorize scale1 and bias1 + minor refactoring * fix doxygen build * fix invalid alignment assertion * clear backend wrappers before allocateLayers * ignore memory lock failures * do not allocate internal blobs * integrate NVTX * add numerically stable half precision log1pexp * fix indentation, following coding style, improve docs * remove accidental modification of IE code * Revert "add asynchronous forward" This reverts commit 1154b9da9da07e9b52f8a81bdcea48cf31c56f70. * [cmake] throw error for unsupported CC versions * fix rebase issues * add more docs, refactor code, fix bugs * minor refactoring and fixes * resolve warnings/errors from clang * remove haveCUDA() checks from supportBackend() * remove NVTX integration * changes based on review comments * avoid exception when no CUDA device is present * add color code for CUDA in Net::dump
142 lines
5.4 KiB
CMake
142 lines
5.4 KiB
CMake
# Disable the module when configuring for WinRT.
if(WINRT)
  ocv_module_disable(dnn)
endif()

# Disable the module when protobuf is not available (HAVE_PROTOBUF is unset/false).
if(NOT HAVE_PROTOBUF)
  ocv_module_disable(opencv_dnn)
endif()
|
|
|
|
# One-line description of the module.
set(the_description
    "Deep neural network module. It allows to load models from different frameworks and to make forward pass")

# Register "layers/layers_common" as a dispatched file for the AVX, AVX2 and
# AVX512_SKX targets. Must stay ahead of ocv_add_module(), as in the original order.
ocv_add_dispatched_file_force_all(
  "layers/layers_common"
  AVX
  AVX2
  AVX512_SKX
)

# Declare the dnn module. opencv_core and opencv_imgproc are passed as its
# dependencies; WRAP lists python, java and js (presumably the language
# bindings to generate -- confirm against ocv_add_module).
ocv_add_module(
  dnn
  opencv_core
  opencv_imgproc
  WRAP python java js
)
|
|
|
|
# User switch for the OpenCL code path; the condition passed to ocv_option is
# "HAVE_OPENCL AND NOT APPLE".
ocv_option(
  OPENCV_DNN_OPENCL
  "Build with OpenCL support"
  HAVE_OPENCL AND NOT APPLE
)

# Define CV_OCL4DNN=1 only when the switch is on and OpenCL was actually found.
if(HAVE_OPENCL AND OPENCV_DNN_OPENCL)
  add_definitions(-DCV_OCL4DNN=1)
endif()
|
|
|
|
# User switch for the CUDA backend; the condition passed to ocv_option requires
# CUDA, cuBLAS and cuDNN to all be available.
ocv_option(
  OPENCV_DNN_CUDA
  "Build with CUDA support"
  HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN
)

# Define CV_CUDA4DNN=1 only when the switch is on and every CUDA prerequisite is present.
if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN AND OPENCV_DNN_CUDA)
  add_definitions(-DCV_CUDA4DNN=1)
endif()
|
|
|
|
# Append this module's hook script to the INIT_MODULE_SOURCES_opencv_dnn hook.
ocv_cmake_hook_append(
  INIT_MODULE_SOURCES_opencv_dnn
  "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake"
)
|
|
|
|
# Compiler-specific warning suppression for the whole module.
if(MSVC)
  # Define _CRT_SECURE_NO_WARNINGS and switch off a fixed set of MSVC warnings.
  add_definitions(-D_CRT_SECURE_NO_WARNINGS=1)
  ocv_warnings_disable(CMAKE_CXX_FLAGS
    /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251
    /wd4996 /wd4146 /wd4305 /wd4127 /wd4100 /wd4512
    /wd4125 /wd4389 /wd4510 /wd4610 /wd4702 /wd4456
    /wd4457 /wd4065 /wd4310 /wd4661 /wd4506
  )
else()
  # Non-MSVC compilers: the list is given in -W form and handed to ocv_warnings_disable.
  ocv_warnings_disable(CMAKE_CXX_FLAGS
    -Wno-deprecated
    -Wmissing-prototypes
    -Wmissing-declarations
    -Wshadow
    -Wunused-parameter
    -Wsign-compare
  )
endif()

# Additionally drop -Wundef whenever CUDA is available.
if(HAVE_CUDA)
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
endif()

# Without C++11 support, also silence undef warnings.
if(NOT HAVE_CXX11)
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef)  # LANG_CXX11 from protobuf files
endif()
|
|
|
|
# Apple framework builds: silence 64-to-32 bit shortening warnings.
if(APPLE_FRAMEWORK)
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshorten-64-to-32)
endif()

# Android builds: add the two DISABLE_* definitions below.
if(ANDROID)
  add_definitions(
    -DDISABLE_POSIX_MEMALIGN
    -DTH_DISABLE_HEAP_TRACKING
  )
endif()

# When protobuf is not built as part of this build (BUILD_PROTOBUF is off),
# flag it as external for the sources.
if(NOT BUILD_PROTOBUF)
  add_definitions(-DOPENCV_DNN_EXTERNAL_PROTOBUF=1)
endif()

# Always defined at this point in the script.
add_definitions(-DHAVE_PROTOBUF=1)
|
|
|
|
# Suppress warnings in autogenerated caffe.pb.* files.
# The list mixes /wd, -wd and -W style switches in a single call, exactly as
# passed to ocv_warnings_disable.
ocv_warnings_disable(CMAKE_CXX_FLAGS
  /wd4125
  /wd4267
  /wd4127
  /wd4244
  /wd4512
  /wd4702
  /wd4456
  /wd4510
  /wd4610
  /wd4800
  /wd4701 /wd4703     # potentially uninitialized local/pointer variable 'value' used
  /wd4505             # unreferenced local function has been removed
  -wd858 -wd2196
  -Winvalid-offsetof  # Apple Clang (attr_value.pb.cc)
)
|
|
|
|
# Choose where the framework protobuf sources (fw_srcs / fw_hdrs) come from.
if(NOT PROTOBUF_UPDATE_FILES)
  # Default: use the pre-generated files under misc/ and expose their
  # directories through fw_inc.
  file(GLOB fw_srcs
    "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.cc"
    "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.cc"
    "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.cc"
  )
  file(GLOB fw_hdrs
    "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.h"
    "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.h"
    "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.h"
  )
  set(fw_inc
    "${CMAKE_CURRENT_LIST_DIR}/misc/caffe"
    "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow"
    "${CMAKE_CURRENT_LIST_DIR}/misc/onnx"
  )
else()
  # PROTOBUF_UPDATE_FILES is on: regenerate the C++ sources from the .proto
  # files under src/. Note that fw_inc is not set on this path.
  file(GLOB proto_files
    "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto"
    "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto"
    "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto"
  )
  set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON)  # required for tensorflow
  protobuf_generate_cpp(fw_srcs fw_hdrs ${proto_files})
endif()
|
|
|
|
# Working lists consumed further down in this script:
#   include_dirs    - extra include directories for the module
#   sources_options - options forwarded to ocv_glob_module_sources()
#   libs            - libraries forwarded to ocv_create_module()
set(include_dirs ${fw_inc})
set(sources_options "")
set(libs
  libprotobuf
  ${LAPACK_LIBRARIES}
)

# With OpenCL enabled, add its include directories; otherwise record
# EXCLUDE_OPENCL in the source-glob options.
if(HAVE_OPENCL AND OPENCV_DNN_OPENCL)
  list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS})
else()
  # sources_options was reset to empty just above, so appending yields exactly "EXCLUDE_OPENCL".
  list(APPEND sources_options EXCLUDE_OPENCL)
endif()
|
|
|
|
# CUDA backend setup: either add the CUDA/cuDNN include directories and validate
# the requested compute capabilities, or record EXCLUDE_CUDA in the source-glob options.
if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN AND OPENCV_DNN_CUDA)
  list(APPEND include_dirs
    ${CUDA_TOOLKIT_INCLUDE}
    ${CUDNN_INCLUDE_DIRS}
  )

  # Split CUDA_ARCH_BIN into a list and abort the configure step as soon as one
  # entry compares lower than version 5.3.
  set(CC_LIST ${CUDA_ARCH_BIN})
  separate_arguments(CC_LIST)
  foreach(cc ${CC_LIST})
    if(cc VERSION_LESS 5.3)
      message(FATAL_ERROR "CUDA backend for DNN module requires CC 5.3 or higher. Please remove unsupported architectures from CUDA_ARCH_BIN option.")
    endif()
  endforeach()
  unset(CC_LIST)  # temporary list, not needed past this point
else()
  list(APPEND sources_options EXCLUDE_CUDA)
endif()
|
|
|
|
# Add the collected include directories to the module.
ocv_module_include_directories(${include_dirs})

# Per-compiler warning opt-outs applied only to the files listed in fw_srcs.
# Apple's toolchain reports the compiler id "AppleClang" (not "Clang") when
# CMake policy CMP0025 is NEW, so both ids are matched explicitly; otherwise
# the Clang-specific flag would silently never be applied on Apple Clang.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override")  # GCC
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
  ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override")  # Clang / Apple Clang
endif()
|
|
# Collect the module sources: the accumulated options come first, then the
# framework protobuf sources are passed after the SOURCES keyword.
ocv_glob_module_sources(
  ${sources_options}
  SOURCES ${fw_srcs}
)

# Create the module, passing the collected libraries plus ${INF_ENGINE_TARGET}.
ocv_create_module(
  ${libs}
  ${INF_ENGINE_TARGET}
)

# Register samples and accuracy tests; the tests also receive ${INF_ENGINE_TARGET}.
ocv_add_samples()
ocv_add_accuracy_tests(${INF_ENGINE_TARGET})
|
|
|
|
# Performance tests: gather every source and header below perf/ recursively.
set(perf_path "${CMAKE_CURRENT_LIST_DIR}/perf")
file(GLOB_RECURSE perf_srcs
  "${perf_path}/*.cpp"
)
file(GLOB_RECURSE perf_hdrs
  "${perf_path}/*.hpp"
  "${perf_path}/*.h"
)

# Register the perf tests. Three FILES groups are passed: the common helper
# headers taken from test/, the perf sources, and the perf headers.
ocv_add_perf_tests(
  ${INF_ENGINE_TARGET}
  FILES test_common
    "${CMAKE_CURRENT_LIST_DIR}/test/test_common.hpp"
    "${CMAKE_CURRENT_LIST_DIR}/test/test_common.impl.hpp"
  FILES Src ${perf_srcs}
  FILES Include ${perf_hdrs}
)
|
|
|
|
# Optional comparison runs against Caffe / clCaffe in the perf tests (both default to OFF).
ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
ocv_option(${the_module}_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)

if(BUILD_PERF_TESTS)
  # Select the definition for the requested flavour. The Caffe option takes
  # precedence: the clCaffe option is only considered when the Caffe one is off.
  set(dnn_perf_caffe_define "")
  if(${the_module}_PERF_CAFFE)
    set(dnn_perf_caffe_define "-DHAVE_CAFFE=1")
  elseif(${the_module}_PERF_CLCAFFE)
    set(dnn_perf_caffe_define "-DHAVE_CLCAFFE=1")
  endif()

  # Look for Caffe only if one of the options was enabled; when it is not
  # found, nothing is defined or linked and no message is printed.
  if(NOT "${dnn_perf_caffe_define}" STREQUAL "")
    find_package(Caffe QUIET)
    if(Caffe_FOUND)
      add_definitions(${dnn_perf_caffe_define})
      ocv_target_link_libraries(opencv_perf_dnn caffe)
    endif()
  endif()
  unset(dnn_perf_caffe_define)
endif()
|