From 8f91529edfd3a74383a47120f574bb7de09bfe9f Mon Sep 17 00:00:00 2001
From: Elif Albuz
Date: Mon, 4 Jul 2016 23:56:15 -0700
Subject: [PATCH] Add Carotene - NVIDIA Hardware-Abstraction-Layer for ARM platforms

---
 3rdparty/carotene/.gitignore                  |    8 +
 3rdparty/carotene/CMakeLists.txt              |   42 +
 3rdparty/carotene/README.md                   |    2 +
 3rdparty/carotene/hal/CMakeLists.txt          |  137 +
 3rdparty/carotene/hal/tegra_hal.hpp           | 1851 +++++++++++
 .../carotene/include/carotene/definitions.hpp |   47 +
 .../carotene/include/carotene/functions.hpp   | 2492 +++++++++++++++
 3rdparty/carotene/include/carotene/types.hpp  |  125 +
 3rdparty/carotene/src/absdiff.cpp             |  241 ++
 3rdparty/carotene/src/accumulate.cpp          |  408 +++
 3rdparty/carotene/src/add.cpp                 |  475 +++
 3rdparty/carotene/src/add_weighted.cpp        |  265 ++
 3rdparty/carotene/src/bitwise.cpp             |  225 ++
 3rdparty/carotene/src/blur.cpp                | 1337 ++++++++
 3rdparty/carotene/src/canny.cpp               |  773 +++++
 3rdparty/carotene/src/channel_extract.cpp     |  486 +++
 3rdparty/carotene/src/channels_combine.cpp    |  389 +++
 3rdparty/carotene/src/cmp.cpp                 |  340 ++
 3rdparty/carotene/src/colorconvert.cpp        | 2846 +++++++++++++++++
 3rdparty/carotene/src/common.cpp              |  108 +
 3rdparty/carotene/src/common.hpp              |   96 +
 3rdparty/carotene/src/convert.cpp             | 1331 ++++++++
 3rdparty/carotene/src/convert_depth.cpp       |  399 +++
 3rdparty/carotene/src/convert_scale.cpp       | 2498 +++++++++++++++
 3rdparty/carotene/src/convolution.cpp         |  340 ++
 3rdparty/carotene/src/count_nonzero.cpp       |  430 +++
 3rdparty/carotene/src/div.cpp                 |  694 ++++
 3rdparty/carotene/src/dot_product.cpp         |  260 ++
 3rdparty/carotene/src/fast.cpp                |  428 +++
 3rdparty/carotene/src/fill_minmaxloc.cpp      |  442 +++
 3rdparty/carotene/src/flip.cpp                |  222 ++
 3rdparty/carotene/src/gaussian_blur.cpp       | 1059 ++++++
 3rdparty/carotene/src/in_range.cpp            |  195 ++
 3rdparty/carotene/src/integral.cpp            |  238 ++
 3rdparty/carotene/src/intrinsics.hpp          |  112 +
 3rdparty/carotene/src/laplacian.cpp           |  713 +++++
 3rdparty/carotene/src/magnitude.cpp           |  160 +
 3rdparty/carotene/src/meanstddev.cpp          |  163 +
 3rdparty/carotene/src/median_filter.cpp       |  227 ++
 3rdparty/carotene/src/min_max.cpp             |  139 +
 3rdparty/carotene/src/minmaxloc.cpp           | 1340 ++++++++
 3rdparty/carotene/src/morph.cpp               |  728 +++++
 3rdparty/carotene/src/mul.cpp                 | 1572 +++++++++
 3rdparty/carotene/src/norm.cpp                | 1310 ++++++++
 3rdparty/carotene/src/opticalflow.cpp         |  539 ++++
 3rdparty/carotene/src/phase.cpp               |  274 ++
 3rdparty/carotene/src/pyramid.cpp             | 1414 ++++++++
 3rdparty/carotene/src/reduce.cpp              |  460 +++
 3rdparty/carotene/src/remap.cpp               |  694 ++++
 3rdparty/carotene/src/remap.hpp               |   85 +
 3rdparty/carotene/src/resize.cpp              | 2191 +++++++++++++
 3rdparty/carotene/src/saturate_cast.hpp       |  199 ++
 3rdparty/carotene/src/scharr.cpp              |  219 ++
 3rdparty/carotene/src/separable_filter.cpp    |  109 +
 3rdparty/carotene/src/separable_filter.hpp    | 1161 +++++++
 3rdparty/carotene/src/sobel.cpp               |  317 ++
 3rdparty/carotene/src/sub.cpp                 |  621 ++++
 3rdparty/carotene/src/sum.cpp                 |  385 +++
 3rdparty/carotene/src/template_matching.cpp   |  241 ++
 3rdparty/carotene/src/threshold.cpp           | 1627 ++++++++++
 3rdparty/carotene/src/vtransform.hpp          |  689 ++++
 3rdparty/carotene/src/warp_affine.cpp         |  434 +++
 3rdparty/carotene/src/warp_perspective.cpp    |  464 +++
 63 files changed, 39816 insertions(+)
 create mode 100644 3rdparty/carotene/.gitignore
 create mode 100644 3rdparty/carotene/CMakeLists.txt
 create mode 100644 3rdparty/carotene/README.md
 create mode 100644 3rdparty/carotene/hal/CMakeLists.txt
 create mode 100644 3rdparty/carotene/hal/tegra_hal.hpp
 create mode 100644 3rdparty/carotene/include/carotene/definitions.hpp
 create mode 100644 3rdparty/carotene/include/carotene/functions.hpp
 create mode 100644 3rdparty/carotene/include/carotene/types.hpp
 create mode 100644 3rdparty/carotene/src/absdiff.cpp
 create mode 100644 3rdparty/carotene/src/accumulate.cpp
 create mode 100644 3rdparty/carotene/src/add.cpp
 create mode 100644 3rdparty/carotene/src/add_weighted.cpp
 create mode 100644 3rdparty/carotene/src/bitwise.cpp
 create mode 100644 3rdparty/carotene/src/blur.cpp
 create mode 100644 3rdparty/carotene/src/canny.cpp
 create mode 100644 3rdparty/carotene/src/channel_extract.cpp
 create mode 100644 3rdparty/carotene/src/channels_combine.cpp
 create mode 100644 3rdparty/carotene/src/cmp.cpp
 create mode 100644 3rdparty/carotene/src/colorconvert.cpp
 create mode 100644 3rdparty/carotene/src/common.cpp
 create mode 100644 3rdparty/carotene/src/common.hpp
 create mode 100644 3rdparty/carotene/src/convert.cpp
 create mode 100644 3rdparty/carotene/src/convert_depth.cpp
 create mode 100644 3rdparty/carotene/src/convert_scale.cpp
 create mode 100644 3rdparty/carotene/src/convolution.cpp
 create mode 100644 3rdparty/carotene/src/count_nonzero.cpp
 create mode 100644 3rdparty/carotene/src/div.cpp
 create mode 100644 3rdparty/carotene/src/dot_product.cpp
 create mode 100644 3rdparty/carotene/src/fast.cpp
 create mode 100644 3rdparty/carotene/src/fill_minmaxloc.cpp
 create mode 100644 3rdparty/carotene/src/flip.cpp
 create mode 100644 3rdparty/carotene/src/gaussian_blur.cpp
 create mode 100644 3rdparty/carotene/src/in_range.cpp
 create mode 100644 3rdparty/carotene/src/integral.cpp
 create mode 100644 3rdparty/carotene/src/intrinsics.hpp
 create mode 100644 3rdparty/carotene/src/laplacian.cpp
 create mode 100644 3rdparty/carotene/src/magnitude.cpp
 create mode 100644 3rdparty/carotene/src/meanstddev.cpp
 create mode 100644 3rdparty/carotene/src/median_filter.cpp
 create mode 100644 3rdparty/carotene/src/min_max.cpp
 create mode 100644 3rdparty/carotene/src/minmaxloc.cpp
 create mode 100644 3rdparty/carotene/src/morph.cpp
 create mode 100644 3rdparty/carotene/src/mul.cpp
 create mode 100644 3rdparty/carotene/src/norm.cpp
 create mode 100644 3rdparty/carotene/src/opticalflow.cpp
 create mode 100644 3rdparty/carotene/src/phase.cpp
 create mode 100644 3rdparty/carotene/src/pyramid.cpp
 create mode 100644 3rdparty/carotene/src/reduce.cpp
 create mode 100644 3rdparty/carotene/src/remap.cpp
 create mode 100644 3rdparty/carotene/src/remap.hpp
 create mode 100644 3rdparty/carotene/src/resize.cpp
 create mode 100644 3rdparty/carotene/src/saturate_cast.hpp
 create mode 100644 3rdparty/carotene/src/scharr.cpp
 create mode 100644 3rdparty/carotene/src/separable_filter.cpp
 create mode 100644 3rdparty/carotene/src/separable_filter.hpp
 create mode 100644 3rdparty/carotene/src/sobel.cpp
 create mode 100644 3rdparty/carotene/src/sub.cpp
 create mode 100644 3rdparty/carotene/src/sum.cpp
 create mode 100644 3rdparty/carotene/src/template_matching.cpp
 create mode 100644 3rdparty/carotene/src/threshold.cpp
 create mode 100644 3rdparty/carotene/src/vtransform.hpp
 create mode 100644 3rdparty/carotene/src/warp_affine.cpp
 create mode 100644 3rdparty/carotene/src/warp_perspective.cpp

diff --git a/3rdparty/carotene/.gitignore b/3rdparty/carotene/.gitignore
new file mode 100644
index 0000000000..062445879b
--- /dev/null
+++ b/3rdparty/carotene/.gitignore
@@ -0,0 +1,8 @@
+# Gedit temp files
+*~
+
+# Qt Creator file
+*.user
+
+# MacOS-specific (Desktop Services Store)
+.DS_Store
diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
new file mode 100644
index 0000000000..4dd7807c61
--- /dev/null
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+project(Carotene)
+
+set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
+
+set(CAROTENE_INCLUDE_DIR include)
+set(CAROTENE_SOURCE_DIR src)
+
+file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
+file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
+                                            "${CAROTENE_SOURCE_DIR}/*.hpp")
+
+include_directories(${CAROTENE_INCLUDE_DIR})
+
+if(CMAKE_COMPILER_IS_GNUCC)
+  set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
+
+  # allow more inlines - these parameters improve performance for:
+  # - matchTemplate       about 5-10%
+  # - goodFeaturesToTrack about 10-20%
+  # - cornerHarris        about 30% for some cases
+  set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+add_library(carotene_objs OBJECT
+  ${carotene_headers}
+  ${carotene_sources}
+)
+
+if(NOT CAROTENE_NS STREQUAL "carotene")
+  target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
+endif()
+
+if(WITH_NEON)
+  target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+endif()
+
+set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
diff --git a/3rdparty/carotene/README.md b/3rdparty/carotene/README.md
new file mode 100644
index 0000000000..fbaae5e970
--- /dev/null
+++ b/3rdparty/carotene/README.md
@@ -0,0 +1,2 @@
+This is Carotene, a low-level library containing optimized CPU routines
+that are useful for computer vision algorithms.
diff --git a/3rdparty/carotene/hal/CMakeLists.txt b/3rdparty/carotene/hal/CMakeLists.txt
new file mode 100644
index 0000000000..9eaa94a9f8
--- /dev/null
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@@ -0,0 +1,137 @@
+cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+  set(ARM TRUE)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
+  set(AARCH64 TRUE)
+endif()
+
+if(ANDROID AND ARM)
+  set(WITH_TGPU ON CACHE BOOL "Enable Tegra GPGPU optimization")
+endif()
+
+set(TEGRA_COMPILER_FLAGS "")
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # Generate unwind information even for functions that can't throw/propagate exceptions.
+  # This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
+  list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
+    list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+  else()
+    list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
+                                     -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+  endif()
+  if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d")
+    list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
+                                     -floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
+  endif()
+endif()
+
+if(ARM OR AARCH64)
+  set(CHECK_TEGRA_HARDWARE_DEFAULT ON)
+else()
+  set(CHECK_TEGRA_HARDWARE_DEFAULT OFF)
+endif()
+set(CHECK_TEGRA_HARDWARE ${CHECK_TEGRA_HARDWARE_DEFAULT} CACHE BOOL
+    "Verify Tegra platform before running optimized code")
+
+string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+
+if(ANDROID_NATIVE_API_LEVEL LESS 9 AND (WITH_TGPU OR CHECK_TEGRA_HARDWARE))
+  message(FATAL_ERROR "GPU support and hardware detection are not available for API levels below 9.
+Please disable Tegra GPU support and hardware detection, or configure the project for API level 9 or above.")
+endif()
+
+if(ARMEABI_V7A)
+  if (CMAKE_COMPILER_IS_GNUCXX)
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
+    set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
+  endif()
+endif()
+
+if (CHECK_TEGRA_HARDWARE)
+  add_definitions(-DCHECK_TEGRA_HARDWARE)
+endif()
+
+if(WITH_TGPU)
+  add_definitions(-DHAVE_TGPU)
+endif()
+
+if(WITH_LOGS)
+  add_definitions(-DHAVE_LOGS)
+endif()
+
+set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
+
+function(compile_carotene)
+  if(ENABLE_NEON)
+    set(WITH_NEON ON)
+  endif()
+
+  add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
+
+  if(ARM OR AARCH64)
+    if(CMAKE_BUILD_TYPE)
+      set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+    endif()
+    check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
+    check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
+    if(${CXX_HAS_MFPU_NEON} AND ${C_HAS_MFPU_NEON})
+      get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
+      if(old_flags)
+        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
+      else()
+        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
+      endif()
+    endif()
+  endif()
+endfunction()
+
+compile_carotene()
+
+include_directories("${CAROTENE_DIR}/include")
+
+get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
+set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+  # allow more inlines - these parameters improve performance for:
+  # matchTemplate       about 5-10%
+  # goodFeaturesToTrack about 10-20%
+  # cornerHarris        about 30% for some cases
+  set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+# set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
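For context on what this static library plugs into: OpenCV's custom-HAL mechanism is plain macro substitution. The build includes the HAL header (here tegra_hal.hpp, exported below via CAROTENE_HAL_HEADERS) ahead of OpenCV's default implementations, and every cv_hal_* macro the header redefines replaces the corresponding built-in routine. A minimal sketch of that contract, using a hypothetical my_hal_add8u that is not part of this patch:

    // Hypothetical HAL override, illustrating the cv_hal_* replacement contract.
    // The signature matches OpenCV's cv_hal_add8u hook.
    static inline int my_hal_add8u(const uchar* src1, size_t sz1,
                                   const uchar* src2, size_t sz2,
                                   uchar* dst, size_t sz, int w, int h)
    {
        if (w * h < 1024)
            return CV_HAL_ERROR_NOT_IMPLEMENTED; // fall back to OpenCV's default code
        /* ... optimized implementation ... */
        return CV_HAL_ERROR_OK;
    }
    #undef  cv_hal_add8u
    #define cv_hal_add8u my_hal_add8u

Returning CV_HAL_ERROR_NOT_IMPLEMENTED is always safe: OpenCV then runs its generic path, which is exactly how the isSupportedConfiguration() guards in tegra_hal.hpp below behave on non-Tegra hardware.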
+set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}) +set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}") +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include) + +set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE) +set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE) +set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE) +set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE) + +configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY) diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp new file mode 100644 index 0000000000..f1bf5c67a7 --- /dev/null +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -0,0 +1,1851 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef _tegra_hal_H_INCLUDED_
+#define _tegra_hal_H_INCLUDED_
+
+#define CAROTENE_NS carotene_o4t
+
+#include "carotene/functions.hpp"
+#include
+#include
+#include
+#include
+
+#define RANGE_DATA(type, base, step) reinterpret_cast<type*>(const_cast<char*>(reinterpret_cast<const char*>(base)) + static_cast<size_t>(range.start) * step)
+
+#define PARALLEL_CORE 0
+#if PARALLEL_CORE
+
+#define SRC_ARG1 ST * src1_data_, size_t src1_step_,
+#define SRC_STORE1 src1_data(src1_data_), src1_step(src1_step_),
+#define SRC_VAR1 ST * src1_data; \
+                 size_t src1_step;
+#define SRC_ARG2 ST * src1_data_, size_t src1_step_, \
+                 ST * src2_data_, size_t src2_step_,
+#define SRC_STORE2 src1_data(src1_data_), src1_step(src1_step_), \
+                   src2_data(src2_data_), src2_step(src2_step_),
+#define SRC_VAR2 ST * src1_data; \
+                 size_t src1_step; \
+                 ST * src2_data; \
+                 size_t src2_step;
+
+#define DST_ARG1 DT * dst1_data_, size_t dst1_step_,
+#define DST_STORE1 dst1_data(dst1_data_), dst1_step(dst1_step_),
+#define DST_VAR1 DT * dst1_data; \
+                 size_t dst1_step;
+
+#define SCALE_ARG0
+#define SCALE_STORE0
+#define SCALE_VAR0
+#define SCALE_ARG1 , double scale_
+#define SCALE_STORE1 , scale(scale_)
+#define SCALE_VAR1 double scale;
+#define SCALE_ARG3 , const double *scales_
+#define SCALE_STORE3 , scales(scales_, scales_ + 3)
+#define SCALE_VAR3 std::vector<double> scales;
+
+#define TegraGenOp_Invoker(name, func, src_cnt, dst_cnt, scale_cnt, ...) \
+template <typename ST, typename DT> \
+class TegraGenOp_##name##_Invoker : public cv::ParallelLoopBody \
+{ \
+public: \
+    TegraGenOp_##name##_Invoker(SRC_ARG##src_cnt \
+                                DST_ARG##dst_cnt \
+                                int width_, int height_ \
+                                SCALE_ARG##scale_cnt) : \
+        cv::ParallelLoopBody(), SRC_STORE##src_cnt \
+                                DST_STORE##dst_cnt \
+                                width(width_), height(height_) \
+                                SCALE_STORE##scale_cnt {} \
+    virtual void operator()(const cv::Range& range) const \
+    { \
+        CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \
+    } \
+private: \
+    SRC_VAR##src_cnt \
+    DST_VAR##dst_cnt \
+    int width, height; \
+    SCALE_VAR##scale_cnt \
+    const TegraGenOp_##name##_Invoker& operator= (const TegraGenOp_##name##_Invoker&); \
+};
+
+#define TegraBinaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 2, 1, 0, \
+    RANGE_DATA(ST, src1_data, src1_step), src1_step, \
+    RANGE_DATA(ST, src2_data, src2_step), src2_step, \
+    RANGE_DATA(DT, dst1_data, dst1_step), dst1_step )
+
+#define TegraBinaryOp_InvokerVAArg(name, func, ...) TegraGenOp_Invoker(name, func, 2, 1, 0, \
+    RANGE_DATA(ST, src1_data, src1_step), src1_step, \
+    RANGE_DATA(ST, src2_data, src2_step), src2_step, \
+    RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__)
+
+#define TEGRA_BINARYOP(type, op, src1, sz1, src2, sz2, dst, sz, w, h) \
+( \
+    CAROTENE_NS::isSupportedConfiguration() ?
\ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOp_InvokerVAArg(add, add, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(addf, add) + +TegraBinaryOp_InvokerVAArg(sub, sub, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(subf, sub) + +TegraBinaryOp_Invoker(max, max) + +TegraBinaryOp_Invoker(min, min) + +TegraBinaryOp_Invoker(absDiff, absDiff) + +TegraBinaryOp_Invoker(bitwiseAnd, bitwiseAnd) + +TegraBinaryOp_Invoker(bitwiseOr, bitwiseOr) + +TegraBinaryOp_Invoker(bitwiseXor, bitwiseXor) + +#define TegraUnaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 1, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step ) + +TegraUnaryOp_Invoker(bitwiseNot, bitwiseNot) +#define TEGRA_UNARYOP(type, op, src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add8s +#define cv_hal_add8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16u +#define cv_hal_add16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16s +#define cv_hal_add16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32s +#define cv_hal_add32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32f +#define cv_hal_add32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, addf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_add64f +//#define cv_hal_add64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, addf, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8u +#define cv_hal_sub8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8s +#define cv_hal_sub8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16u +#define cv_hal_sub16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16s +#define cv_hal_sub16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32s +#define cv_hal_sub32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32f +#define cv_hal_sub32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, subf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_sub64f +//#define cv_hal_sub64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, subf, src1, sz1, src2, sz2, dst, 
sz, w, h) +#undef cv_hal_max8u +#define cv_hal_max8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max8s +#define cv_hal_max8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16u +#define cv_hal_max16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16s +#define cv_hal_max16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32s +#define cv_hal_max32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32f +#define cv_hal_max32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, max, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_max64f +//#define cv_hal_max64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8u +#define cv_hal_min8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8s +#define cv_hal_min8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16u +#define cv_hal_min16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16s +#define cv_hal_min16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32s +#define cv_hal_min32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32f +#define cv_hal_min32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, min, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_min64f +//#define cv_hal_min64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_and8u +#define cv_hal_and8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseAnd, src1, sz1, src2, sz2, dst, sz, w, h) +#undef 
cv_hal_or8u +#define cv_hal_or8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseOr, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_xor8u +#define cv_hal_xor8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseXor, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_not8u +#define cv_hal_not8u(src1, sz1, dst, sz, w, h) TEGRA_UNARYOP(CAROTENE_NS::u8, bitwiseNot, src1, sz1, dst, sz, w, h) + +TegraBinaryOp_Invoker(cmpEQ, cmpEQ) +TegraBinaryOp_Invoker(cmpNE, cmpNE) +TegraBinaryOp_Invoker(cmpGT, cmpGT) +TegraBinaryOp_Invoker(cmpGE, cmpGE) +TegraGenOp_Invoker(cmpLT, cmpGT, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +TegraGenOp_Invoker(cmpLE, cmpGE, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +#define TEGRA_CMP(type, src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpEQ_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpNE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp8s +#define cv_hal_cmp8s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16u +#define cv_hal_cmp16u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16s +#define cv_hal_cmp16s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32s +#define cv_hal_cmp32s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s32, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32f +#define cv_hal_cmp32f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f32, src1, sz1, src2, sz2, dst, sz, w, h, op) +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f64, src1, sz1, src2, sz2, dst, sz, w, h, op) + +#define TegraBinaryOpScale_Invoker(name, func, scale_cnt, ...) 
TegraGenOp_Invoker(name, func, 2, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_BINARYOPSCALE(type, op, src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOpScale_Invoker(mul, mul, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(mulf, mul, 1, scale) + +TegraBinaryOpScale_Invoker(div, div, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(divf, div, 1, scale) + +#define TegraUnaryOpScale_Invoker(name, func, scale_cnt, ...) TegraGenOp_Invoker(name, func, 1, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_UNARYOPSCALE(type, op, src1, sz1, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraUnaryOpScale_Invoker(recip, reciprocal, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraUnaryOpScale_Invoker(recipf, reciprocal, 1, scale) + +#undef cv_hal_mul8u +#define cv_hal_mul8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul8s +#define cv_hal_mul8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16u +#define cv_hal_mul16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16s +#define cv_hal_mul16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32s +#define cv_hal_mul32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32f +#define cv_hal_mul32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_mul64f +//#define cv_hal_mul64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8u +#define cv_hal_div8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8s +#define cv_hal_div8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16u +#define cv_hal_div16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16s +#define cv_hal_div16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32s +#define cv_hal_div32s(src1, sz1, 
src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32f +#define cv_hal_div32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_div64f +//#define cv_hal_div64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_recip8u +#define cv_hal_recip8u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip8s +#define cv_hal_recip8s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16u +#define cv_hal_recip16u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16s +#define cv_hal_recip16s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32s +#define cv_hal_recip32s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s32, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32f +#define cv_hal_recip32f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f32, recipf, src1, sz1, dst, sz, w, h, scales) +//#undef cv_hal_recip64f +//#define cv_hal_recip64f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f64, recipf, src1, sz1, dst, sz, w, h, scales) + +TegraBinaryOpScale_Invoker(addWeighted, addWeighted, 3, scales[0], scales[1], scales[2]) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) + +#else + +#define TEGRA_ADD(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ADDF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUB(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUBF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MAX(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::max(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MIN(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::min(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ABSDIFF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::absDiff(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_AND(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseAnd(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) +#define TEGRA_OR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseOr(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_XOR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseXor(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_NOT(src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::bitwiseNot(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u TEGRA_ADD +#undef cv_hal_add8s +#define cv_hal_add8s TEGRA_ADD +#undef cv_hal_add16u +#define cv_hal_add16u TEGRA_ADD +#undef cv_hal_add16s +#define cv_hal_add16s TEGRA_ADD +#undef cv_hal_add32s +#define cv_hal_add32s TEGRA_ADD +#undef cv_hal_add32f +#define cv_hal_add32f TEGRA_ADDF +//#undef cv_hal_add64f +//#define cv_hal_add64f TEGRA_ADDF +#undef cv_hal_sub8u +#define cv_hal_sub8u TEGRA_SUB +#undef cv_hal_sub8s +#define cv_hal_sub8s TEGRA_SUB +#undef cv_hal_sub16u +#define cv_hal_sub16u TEGRA_SUB +#undef cv_hal_sub16s +#define cv_hal_sub16s TEGRA_SUB +#undef cv_hal_sub32s +#define cv_hal_sub32s TEGRA_SUB +#undef cv_hal_sub32f +#define cv_hal_sub32f TEGRA_SUBF +//#undef cv_hal_sub64f +//#define cv_hal_sub64f TEGRA_SUBF +#undef cv_hal_max8u +#define cv_hal_max8u TEGRA_MAX +#undef cv_hal_max8s +#define cv_hal_max8s TEGRA_MAX +#undef cv_hal_max16u +#define cv_hal_max16u TEGRA_MAX +#undef cv_hal_max16s +#define cv_hal_max16s TEGRA_MAX +#undef cv_hal_max32s +#define cv_hal_max32s TEGRA_MAX +#undef cv_hal_max32f +#define cv_hal_max32f TEGRA_MAX +//#undef cv_hal_max64f +//#define cv_hal_max64f TEGRA_MAX +#undef cv_hal_min8u +#define cv_hal_min8u TEGRA_MIN +#undef cv_hal_min8s +#define cv_hal_min8s TEGRA_MIN +#undef cv_hal_min16u +#define cv_hal_min16u TEGRA_MIN +#undef cv_hal_min16s +#define cv_hal_min16s TEGRA_MIN +#undef cv_hal_min32s +#define cv_hal_min32s TEGRA_MIN +#undef cv_hal_min32f +#define cv_hal_min32f TEGRA_MIN +//#undef cv_hal_min64f +//#define cv_hal_min64f TEGRA_MIN +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u TEGRA_ABSDIFF +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s TEGRA_ABSDIFF +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u TEGRA_ABSDIFF +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s TEGRA_ABSDIFF +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s TEGRA_ABSDIFF +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f TEGRA_ABSDIFF +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f TEGRA_ABSDIFF +#undef cv_hal_and8u +#define cv_hal_and8u TEGRA_AND +#undef cv_hal_or8u +#define cv_hal_or8u TEGRA_OR +#undef cv_hal_xor8u +#define cv_hal_xor8u TEGRA_XOR +#undef cv_hal_not8u +#define cv_hal_not8u TEGRA_NOT + +#define TEGRA_CMP(src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + CAROTENE_NS::cmpEQ(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + CAROTENE_NS::cmpNE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? 
\ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u TEGRA_CMP +#undef cv_hal_cmp8s +#define cv_hal_cmp8s TEGRA_CMP +#undef cv_hal_cmp16u +#define cv_hal_cmp16u TEGRA_CMP +#undef cv_hal_cmp16s +#define cv_hal_cmp16s TEGRA_CMP +#undef cv_hal_cmp32s +#define cv_hal_cmp32s TEGRA_CMP +#undef cv_hal_cmp32f +#define cv_hal_cmp32f TEGRA_CMP +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f TEGRA_CMP + +#define TEGRA_MUL(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MULF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIV(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIVF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIP(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIPF(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_mul8u +#define cv_hal_mul8u TEGRA_MUL +#undef cv_hal_mul8s +#define cv_hal_mul8s TEGRA_MUL +#undef cv_hal_mul16u +#define cv_hal_mul16u TEGRA_MUL +#undef cv_hal_mul16s +#define cv_hal_mul16s TEGRA_MUL +#undef cv_hal_mul32s +#define cv_hal_mul32s TEGRA_MUL +#undef cv_hal_mul32f +#define cv_hal_mul32f TEGRA_MULF +//#undef cv_hal_mul64f +//#define cv_hal_mul64f TEGRA_MULF +#undef cv_hal_div8u +#define cv_hal_div8u TEGRA_DIV +#undef cv_hal_div8s +#define cv_hal_div8s TEGRA_DIV +#undef cv_hal_div16u +#define cv_hal_div16u TEGRA_DIV +#undef cv_hal_div16s +#define cv_hal_div16s TEGRA_DIV +#undef cv_hal_div32s +#define cv_hal_div32s TEGRA_DIV +#undef cv_hal_div32f +#define cv_hal_div32f TEGRA_DIVF +//#undef cv_hal_div64f +//#define cv_hal_div64f TEGRA_DIVF +#undef cv_hal_recip8u +#define cv_hal_recip8u TEGRA_RECIP +#undef cv_hal_recip8s +#define cv_hal_recip8s TEGRA_RECIP +#undef cv_hal_recip16u +#define cv_hal_recip16u TEGRA_RECIP +#undef cv_hal_recip16s +#define cv_hal_recip16s TEGRA_RECIP +#undef cv_hal_recip32s +#define cv_hal_recip32s TEGRA_RECIP +#undef cv_hal_recip32f +#define cv_hal_recip32f TEGRA_RECIPF +//#undef cv_hal_recip64f +//#define cv_hal_recip64f TEGRA_RECIPF + +#define TEGRA_ADDWEIGHTED(src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::addWeighted(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + ((double *)scales)[0], ((double *)scales)[1], ((double *)scales)[2]), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f TEGRA_ADDWEIGHTED + +#endif //PARALLEL_CORE + +#define ROW_SRC_ARG1 const ST * src1_data_ +#define ROW_SRC_STORE1 , src1_data(src1_data_) +#define ROW_SRC_VAR1 const ST * src1_data; +#define ROW_SRC_ARG2 ROW_SRC_ARG1 \ + , const ST * src2_data_ +#define ROW_SRC_STORE2 ROW_SRC_STORE1 \ + , src2_data(src2_data_) +#define ROW_SRC_VAR2 ROW_SRC_VAR1 \ + const ST * src2_data; +#define ROW_SRC_ARG3 ROW_SRC_ARG2 \ + , const ST * src3_data_ +#define ROW_SRC_STORE3 ROW_SRC_STORE2 \ + , src3_data(src3_data_) +#define ROW_SRC_VAR3 ROW_SRC_VAR2 \ + const ST * src3_data; +#define ROW_SRC_ARG4 ROW_SRC_ARG3 \ + , const ST * src4_data_ +#define ROW_SRC_STORE4 ROW_SRC_STORE3 \ + , src4_data(src4_data_) +#define ROW_SRC_VAR4 ROW_SRC_VAR3 \ + const ST * src4_data; + +#define ROW_DST_ARG1 , DT * dst1_data_ +#define ROW_DST_STORE1 , dst1_data(dst1_data_) +#define ROW_DST_VAR1 DT * dst1_data; +#define ROW_DST_ARG2 ROW_DST_ARG1 \ + , DT * dst2_data_ +#define ROW_DST_STORE2 ROW_DST_STORE1 \ + , dst2_data(dst2_data_) +#define ROW_DST_VAR2 ROW_DST_VAR1 \ + DT * dst2_data; +#define ROW_DST_ARG3 ROW_DST_ARG2 \ + , DT * dst3_data_ +#define ROW_DST_STORE3 ROW_DST_STORE2 \ + , dst3_data(dst3_data_) +#define ROW_DST_VAR3 ROW_DST_VAR2 \ + DT * dst3_data; +#define ROW_DST_ARG4 ROW_DST_ARG3 \ + , DT * 
dst4_data_ +#define ROW_DST_STORE4 ROW_DST_STORE3 \ + , dst4_data(dst4_data_) +#define ROW_DST_VAR4 ROW_DST_VAR3 \ + DT * dst4_data; + +#define ROW_VAL_ARG0 +#define ROW_VAL_STORE0 +#define ROW_VAL_VAR0 +#define ROW_VAL_ARG1 , double val_ +#define ROW_VAL_STORE1 , val(val_) +#define ROW_VAL_VAR1 double val; + +#define TegraRowOp_Invoker(name, func, src_cnt, dst_cnt, val_cnt, ...) \ +template \ +class TegraRowOp_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraRowOp_##name##_Invoker(ROW_SRC_ARG##src_cnt \ + ROW_DST_ARG##dst_cnt \ + ROW_VAL_ARG##val_cnt) : \ + cv::ParallelLoopBody() ROW_SRC_STORE##src_cnt \ + ROW_DST_STORE##dst_cnt \ + ROW_VAL_STORE##val_cnt {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(range.end-range.start, 1), __VA_ARGS__); \ + } \ +private: \ + ROW_SRC_VAR##src_cnt \ + ROW_DST_VAR##dst_cnt \ + ROW_VAL_VAR##val_cnt \ + const TegraRowOp_##name##_Invoker& operator= (const TegraRowOp_##name##_Invoker&); \ +}; + + +#define TEGRA_SPLIT(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::split2(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + CAROTENE_NS::split3(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::split4(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len, \ + dst[3], len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(split2, split2, 1, 2, 0, RANGE_DATA(ST, src1_data, 2*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split3, split3, 1, 3, 0, RANGE_DATA(ST, src1_data, 3*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split4, split4, 1, 4, 0, RANGE_DATA(ST, src1_data, 4*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst4_data, sizeof(DT)), range.end-range.start) +#define TEGRA_SPLIT64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split2_Invoker(src, dst[0], dst[1]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split3_Invoker(src, dst[0], dst[1], dst[2]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split4_Invoker(src, dst[0], dst[1], dst[2], dst[3]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MERGE(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::combine2(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? 
\ + CAROTENE_NS::combine3(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::combine4(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + src[3], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(combine2, combine2, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 2*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine3, combine3, 3, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 3*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine4, combine4, 4, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src4_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 4*sizeof(DT)), range.end-range.start) +#define TEGRA_MERGE64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine2_Invoker(src[0], src[1], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine3_Invoker(src[0], src[1], src[2], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine4_Invoker(src[0], src[1], src[2], src[3], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_split8u +#define cv_hal_split8u TEGRA_SPLIT +#undef cv_hal_split16u +#define cv_hal_split16u TEGRA_SPLIT +#undef cv_hal_split32s +#define cv_hal_split32s TEGRA_SPLIT +#undef cv_hal_split64s +#define cv_hal_split64s(src, dst, len, cn) TEGRA_SPLIT64S(CAROTENE_NS::s64, src, dst, len, cn) + +#undef cv_hal_merge8u +#define cv_hal_merge8u TEGRA_MERGE +#undef cv_hal_merge16u +#define cv_hal_merge16u TEGRA_MERGE +#undef cv_hal_merge32s +#define cv_hal_merge32s TEGRA_MERGE +#undef cv_hal_merge64s +#define cv_hal_merge64s(src, dst, len, cn) TEGRA_MERGE64S(CAROTENE_NS::s64, src, dst, len, cn) + + +TegraRowOp_Invoker(phase, phase, 2, 1, 1, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, val) +#define TEGRA_FASTATAN(y, x, dst, len, angleInDegrees) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_phase_Invoker(x, y, dst, angleInDegrees ? 
1.0f : M_PI/180), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f TEGRA_FASTATAN + +TegraRowOp_Invoker(magnitude, magnitude, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start) +#define TEGRA_MAGNITUDE(x, y, dst, len) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_magnitude_Invoker(x, y, dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_magnitude32f +#define cv_hal_magnitude32f TEGRA_MAGNITUDE + + +#if defined OPENCV_IMGPROC_HAL_INTERFACE_H + +struct cvhalFilter2D; + +struct FilterCtx +{ + CAROTENE_NS::Size2D ksize; + CAROTENE_NS::s16* kernel_data; + CAROTENE_NS::BORDER_MODE border; +}; +inline int TEGRA_FILTERINIT(cvhalFilter2D **context, uchar *kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, + int max_width, int max_height, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || allowSubmatrix || allowInplace || + src_type != CV_8UC1 || dst_type != CV_8UC1 || + delta != 0 || anchor_x != kernel_width / 2 || anchor_y != kernel_height / 2 ) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + FilterCtx* ctx = new FilterCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT; + break; + case CV_HAL_BORDER_REPLICATE: + ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE; + break; + case CV_HAL_BORDER_REFLECT: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT; + break; + case CV_HAL_BORDER_WRAP: + ctx->border = CAROTENE_NS::BORDER_MODE_WRAP; + break; + case CV_HAL_BORDER_REFLECT_101: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if(!CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(max_width, max_height), ctx->ksize, ctx->border)) + { + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + ctx->kernel_data = new CAROTENE_NS::s16[kernel_width*kernel_height]; + if(!ctx->kernel_data) + return CV_HAL_ERROR_UNKNOWN; + switch(kernel_type) + { + case CV_8UC1: + convert(ctx->ksize, (CAROTENE_NS::u8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_8SC1: + convert(ctx->ksize, (CAROTENE_NS::s8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_16UC1: + for(int j = 0; j < kernel_height; ++j) + { + std::memcpy(ctx->kernel_data + kernel_width * j, kernel_data + kernel_step * j, kernel_width * sizeof(int16_t)); + } + default: + delete[] ctx->kernel_data; + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + *context = (cvhalFilter2D*)(ctx); + return CV_HAL_ERROR_OK; +} +inline int TEGRA_FILTERFREE(cvhalFilter2D *context) +{ + if(context) + { + if(((FilterCtx*)context)->kernel_data) + delete[] ((FilterCtx*)context)->kernel_data; + delete (FilterCtx*)context; + return CV_HAL_ERROR_OK; + } + else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_FILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, 
+
+
+#if defined OPENCV_IMGPROC_HAL_INTERFACE_H
+
+struct cvhalFilter2D;
+
+struct FilterCtx
+{
+    CAROTENE_NS::Size2D ksize;
+    CAROTENE_NS::s16* kernel_data;
+    CAROTENE_NS::BORDER_MODE border;
+};
+inline int TEGRA_FILTERINIT(cvhalFilter2D **context, uchar *kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height,
+                            int max_width, int max_height, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool allowSubmatrix, bool allowInplace)
+{
+    if(!context || !kernel_data || allowSubmatrix || allowInplace ||
+       src_type != CV_8UC1 || dst_type != CV_8UC1 ||
+       delta != 0 || anchor_x != kernel_width / 2 || anchor_y != kernel_height / 2 )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    FilterCtx* ctx = new FilterCtx;
+    if(!ctx)
+        return CV_HAL_ERROR_UNKNOWN;
+    ctx->ksize.width = kernel_width;
+    ctx->ksize.height = kernel_height;
+    switch(borderType)
+    {
+    case CV_HAL_BORDER_CONSTANT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT;
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE;
+        break;
+    case CV_HAL_BORDER_REFLECT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT;
+        break;
+    case CV_HAL_BORDER_WRAP:
+        ctx->border = CAROTENE_NS::BORDER_MODE_WRAP;
+        break;
+    case CV_HAL_BORDER_REFLECT_101:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101;
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if(!CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(max_width, max_height), ctx->ksize, ctx->border))
+    {
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    ctx->kernel_data = new CAROTENE_NS::s16[kernel_width*kernel_height];
+    if(!ctx->kernel_data)
+        return CV_HAL_ERROR_UNKNOWN;
+    switch(kernel_type)
+    {
+    case CV_8UC1:
+        convert(ctx->ksize, (CAROTENE_NS::u8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width);
+        break;
+    case CV_8SC1:
+        convert(ctx->ksize, (CAROTENE_NS::s8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width);
+        break;
+    case CV_16UC1:
+        for(int j = 0; j < kernel_height; ++j)
+        {
+            std::memcpy(ctx->kernel_data + kernel_width * j, kernel_data + kernel_step * j, kernel_width * sizeof(int16_t));
+        }
+        break;
+    default:
+        delete[] ctx->kernel_data;
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    *context = (cvhalFilter2D*)(ctx);
+    return CV_HAL_ERROR_OK;
+}
+inline int TEGRA_FILTERFREE(cvhalFilter2D *context)
+{
+    if(context)
+    {
+        if(((FilterCtx*)context)->kernel_data)
+            delete[] ((FilterCtx*)context)->kernel_data;
+        delete (FilterCtx*)context;
+        return CV_HAL_ERROR_OK;
+    }
+    else
+    {
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+}
+#define TEGRA_FILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y) \
+( \
+    (void)full_width, (void)full_height, (void)offset_x, (void)offset_y, \
+    context && CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(width, height), ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->border) ? \
+        CAROTENE_NS::convolution(CAROTENE_NS::Size2D(width, height), \
+                                 src_data, src_step, \
+                                 dst_data, dst_step, \
+                                 ((FilterCtx*)context)->border, 0, \
+                                 ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->kernel_data, 1), \
+        CV_HAL_ERROR_OK \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_filterInit
+#define cv_hal_filterInit TEGRA_FILTERINIT
+#undef cv_hal_filter
+#define cv_hal_filter TEGRA_FILTERIMPL
+#undef cv_hal_filterFree
+#define cv_hal_filterFree TEGRA_FILTERFREE
+
+
+struct SepFilterCtx
+{
+    int16_t kernelx_data[3];
+    int16_t kernely_data[3];
+    CAROTENE_NS::BORDER_MODE border;
+};
+inline int TEGRA_SEPFILTERINIT(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type,
+                               uchar *kernelx_data, size_t , int kernelx_width, int kernelx_height,
+                               uchar *kernely_data, size_t kernely_step, int kernely_width, int kernely_height,
+                               int anchor_x, int anchor_y, double delta, int borderType)
+{
+    if(!context || !kernelx_data || !kernely_data || src_type != CV_8UC1 || dst_type != CV_16SC1 ||
+       !(kernelx_width == 3 && kernelx_height == 1) || !(kernely_width == 1 && kernely_height == 3) ||
+       delta != 0 || anchor_x != 1 || anchor_y != 1)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    SepFilterCtx* ctx = new SepFilterCtx;
+    if(!ctx)
+        return CV_HAL_ERROR_UNKNOWN;
+    switch(borderType)
+    {
+    case CV_HAL_BORDER_CONSTANT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT;
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE;
+        break;
+    case CV_HAL_BORDER_REFLECT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT;
+        break;
+    case CV_HAL_BORDER_WRAP:
+        ctx->border = CAROTENE_NS::BORDER_MODE_WRAP;
+        break;
+    case CV_HAL_BORDER_REFLECT_101:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101;
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    if(!CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(16, 16), ctx->border, 3, 3))
+    {
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    switch(kernel_type)
+    {
+    case CV_8UC1:
+        ctx->kernelx_data[0]=kernelx_data[0];
+        ctx->kernelx_data[1]=kernelx_data[1];
+        ctx->kernelx_data[2]=kernelx_data[2];
+        ctx->kernely_data[0]=kernely_data[0];
+        ctx->kernely_data[1]=kernely_data[kernely_step];
+        ctx->kernely_data[2]=kernely_data[2*kernely_step];
+        break;
+    case CV_8SC1:
+        ctx->kernelx_data[0]=((char*)kernelx_data)[0];
+        ctx->kernelx_data[1]=((char*)kernelx_data)[1];
+        ctx->kernelx_data[2]=((char*)kernelx_data)[2];
+        ctx->kernely_data[0]=((char*)kernely_data)[0];
+        ctx->kernely_data[1]=((char*)(kernely_data+kernely_step))[0];
+        ctx->kernely_data[2]=((char*)(kernely_data+2*kernely_step))[0];
+        break;
+    case CV_16UC1:
+        ctx->kernelx_data[0]=((int16_t*)kernelx_data)[0];
+        ctx->kernelx_data[1]=((int16_t*)kernelx_data)[1];
+        ctx->kernelx_data[2]=((int16_t*)kernelx_data)[2];
+        ctx->kernely_data[0]=((int16_t*)kernely_data)[0];
+        ctx->kernely_data[1]=((int16_t*)(kernely_data+kernely_step))[0];
+        ctx->kernely_data[2]=((int16_t*)(kernely_data+2*kernely_step))[0];
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    *context = (cvhalFilter2D*)(ctx);
+    return CV_HAL_ERROR_OK;
+}
+inline int TEGRA_SEPFILTERFREE(cvhalFilter2D *context)
+{
+    if(context)
+    {
+        delete (SepFilterCtx*)context;
+        return CV_HAL_ERROR_OK;
+    }
+ else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_SEPFILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y) \ +( \ + context && CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(width, height), ((SepFilterCtx*)context)->border, 3, 3, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)) ? \ + CAROTENE_NS::SeparableFilter3x3(CAROTENE_NS::Size2D(width, height), \ + src_data, src_step, \ + (CAROTENE_NS::s16*)dst_data, dst_step, \ + 3, 3, ((SepFilterCtx*)context)->kernelx_data, ((SepFilterCtx*)context)->kernely_data, \ + ((SepFilterCtx*)context)->border, 0, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit TEGRA_SEPFILTERINIT +#undef cv_hal_sepFilter +#define cv_hal_sepFilter TEGRA_SEPFILTERIMPL +#undef cv_hal_sepFilterFree +#define cv_hal_sepFilterFree TEGRA_SEPFILTERFREE + + +struct MorphCtx +{ + int operation; + int channels; + CAROTENE_NS::Size2D ksize; + int anchor_x, anchor_y; + CAROTENE_NS::BORDER_MODE border; + uchar borderValues[4]; +}; +inline int TEGRA_MORPHINIT(cvhalFilter2D **context, int operation, int src_type, int dst_type, int, int, + int kernel_type, uchar *kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, + int borderType, const double borderValue[4], int iterations, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || src_type != dst_type || + CV_MAT_DEPTH(src_type) != CV_8U || src_type < 0 || (src_type >> CV_CN_SHIFT) > 3 || + + allowSubmatrix || allowInplace || iterations != 1 || + !CAROTENE_NS::isSupportedConfiguration()) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch(CV_MAT_DEPTH(kernel_type)) + { + case CV_8U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_16U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (uint16_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32S: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (int32_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (float*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_64F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (double*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + default: + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + MorphCtx* ctx = new MorphCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->channels = (src_type >> CV_CN_SHIFT) + 1; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + ctx->anchor_x = anchor_x; + ctx->anchor_y = anchor_y; + switch(operation) + { + case MORPH_ERODE: + case MORPH_DILATE: + ctx->operation = operation; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = 
CAROTENE_NS::BORDER_MODE_CONSTANT;
+        if( borderValue[0] == DBL_MAX && borderValue[1] == DBL_MAX && borderValue[2] == DBL_MAX && borderValue[3] == DBL_MAX )
+        {
+            if( operation == MORPH_ERODE )
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = (CAROTENE_NS::u8)UCHAR_MAX;
+            else
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = 0;
+        }
+        else
+        {
+            for(int i = 0; i < ctx->channels; ++i)
+                ctx->borderValues[i] = (CAROTENE_NS::u8)cv::saturate_cast<CAROTENE_NS::u8>(borderValue[i]);
+        }
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE;
+        break;
+    case CV_HAL_BORDER_REFLECT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT;
+        break;
+    case CV_HAL_BORDER_WRAP:
+        ctx->border = CAROTENE_NS::BORDER_MODE_WRAP;
+        break;
+    case CV_HAL_BORDER_REFLECT_101:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101;
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    *context = (cvhalFilter2D*)(ctx);
+    return CV_HAL_ERROR_OK;
+}
+inline int TEGRA_MORPHFREE(cvhalFilter2D *context)
+{
+    if(context)
+    {
+        delete (MorphCtx*)context;
+        return CV_HAL_ERROR_OK;
+    }
+    else
+    {
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+}
+#define TEGRA_MORPHIMPL(context, src_data, src_step, dst_data, dst_step, width, height, src_full_width, src_full_height, src_roi_x, src_roi_y, dst_full_width, dst_full_height, dst_roi_x, dst_roi_y) \
+( \
+    (void)dst_full_width, (void)dst_full_height, (void)dst_roi_x, (void)dst_roi_y, \
+    context && CAROTENE_NS::isSupportedConfiguration() ? \
+    ((MorphCtx*)context)->operation == MORPH_ERODE ? \
+        CAROTENE_NS::erode(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                           src_data, src_step, dst_data, dst_step, \
+                           ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                           ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                           CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+        CV_HAL_ERROR_OK : \
+    ((MorphCtx*)context)->operation == MORPH_DILATE ? \
+        CAROTENE_NS::dilate(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                            src_data, src_step, dst_data, dst_step, \
+                            ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                            ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                            CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+        CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_morphInit
+#define cv_hal_morphInit TEGRA_MORPHINIT
+#undef cv_hal_morph
+#define cv_hal_morph TEGRA_MORPHIMPL
+#undef cv_hal_morphFree
+#define cv_hal_morphFree TEGRA_MORPHFREE
+
+
+
+#define TEGRA_RESIZE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation) \
+( \
+    /*interpolation == CV_HAL_INTER_LINEAR ?
\ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeLinearOpenCVSupported(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), ((src_type >> CV_CN_SHIFT) + 1)) && \ + inv_scale_x > 0 && inv_scale_y > 0 && \ + (dst_width - 0.5)/inv_scale_x - 0.5 < src_width && (dst_height - 0.5)/inv_scale_y - 0.5 < src_height && \ + (dst_width + 0.5)/inv_scale_x + 0.5 >= src_width && (dst_height + 0.5)/inv_scale_y + 0.5 >= src_height && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeLinearOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \ + interpolation == CV_HAL_INTER_AREA ? \ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeAreaSupported(1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)) && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeAreaOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ + /*nearest neighbour interpolation disabled due to rounding accuracy issues*/ \ + /*interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 1) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 1), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC3 || src_type == CV_8SC3) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 3) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 3), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC4 || src_type == CV_8SC4 || src_type == CV_16UC2 || src_type == CV_16SC2 || src_type == CV_32SC1) && \ + CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 4) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 4), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_WARPAFFINE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \ +( \ + interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \ + CAROTENE_NS::isWarpAffineNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? 
\
+            CAROTENE_NS::warpAffineNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                                   src_data, src_step, \
+                                                   std::vector<float>(M+0,M+6).data(), \
+                                                   dst_data, dst_step, \
+                                                   borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                                   (CAROTENE_NS::u8)borderValue[0]), \
+        CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+        (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+        CAROTENE_NS::isWarpAffineLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+            CAROTENE_NS::warpAffineLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                          src_data, src_step, \
+                                          std::vector<float>(M+0,M+6).data(), \
+                                          dst_data, dst_step, \
+                                          borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                          (CAROTENE_NS::u8)borderValue[0]), \
+        CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#define TEGRA_WARPPERSPECTIVE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \
+( \
+    interpolation == CV_HAL_INTER_NEAREST ? \
+        (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+        CAROTENE_NS::isWarpPerspectiveNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+            CAROTENE_NS::warpPerspectiveNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                                        src_data, src_step, \
+                                                        std::vector<float>(M+0,M+9).data(), \
+                                                        dst_data, dst_step, \
+                                                        borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                                        (CAROTENE_NS::u8)borderValue[0]), \
+        CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+        (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+        CAROTENE_NS::isWarpPerspectiveLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+            CAROTENE_NS::warpPerspectiveLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                               src_data, src_step, \
+                                               std::vector<float>(M+0,M+9).data(), \
+                                               dst_data, dst_step, \
+                                               borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                               (CAROTENE_NS::u8)borderValue[0]), \
+        CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_resize
+#define cv_hal_resize TEGRA_RESIZE
+//warpAffine/warpPerspective disabled due to rounding accuracy issue
+//#undef cv_hal_warpAffine
+//#define cv_hal_warpAffine TEGRA_WARPAFFINE
+//#undef cv_hal_warpPerspective
+//#define cv_hal_warpPerspective TEGRA_WARPPERSPECTIVE
+
+
+#define TegraCvtColor_Invoker(name, func, ...)
\ +class TegraCvtColor_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraCvtColor_##name##_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, int height_) : \ + cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), height(height_) {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \ + } \ +private: \ + const uchar * src_data; \ + size_t src_step; \ + uchar * dst_data; \ + size_t dst_step; \ + int width, height; \ + const TegraCvtColor_##name##_Invoker& operator= (const TegraCvtColor_##name##_Invoker&); \ +}; + +TegraCvtColor_Invoker(rgb2bgr, rgb2bgr, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgb2bgrx, rgb2bgrx, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgb2rgbx, rgb2rgbx, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2bgr, rgbx2bgr, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2rgb, rgbx2rgb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2bgrx, rgbx2bgrx, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +#define TEGRA_CVTBGRTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue) \ +( \ + depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \ + scn == 3 ? \ + dcn == 3 ? \ + swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + dcn == 4 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + scn == 4 ? \ + dcn == 3 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + dcn == 4 ? \ + swapBlue ? 
\ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraCvtColor_Invoker(rgb2bgr565, rgb2bgr565, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgb2rgb565, rgb2rgb565, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2bgr565, rgbx2bgr565, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2rgb565, rgbx2rgb565, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +#define TEGRA_CVTBGRTOBGR565(src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits) \ +( \ + greenBits == 6 && CAROTENE_NS::isSupportedConfiguration() ? \ + scn == 3 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + scn == 4 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraCvtColor_Invoker(rgb2gray, rgb2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(bgr2gray, bgr2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2gray, rgbx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(bgrx2gray, bgrx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +#define TEGRA_CVTBGRTOGRAY(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue) \ +( \ + depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \ + scn == 3 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgr2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + scn == 4 ? \ + (swapBlue ? 
\ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgrx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraCvtColor_Invoker(gray2rgb, gray2rgb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(gray2rgbx, gray2rgbx, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +#define TEGRA_CVTGRAYTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, dcn) \ +( \ + depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \ + dcn == 3 ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_gray2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + dcn == 4 ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_gray2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraCvtColor_Invoker(rgb2ycrcb, rgb2ycrcb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(bgr2ycrcb, bgr2ycrcb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(rgbx2ycrcb, rgbx2ycrcb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +TegraCvtColor_Invoker(bgrx2ycrcb, bgrx2ycrcb, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step) +#define TEGRA_CVTBGRTOYUV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr) \ +( \ + isCbCr && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \ + scn == 3 ? \ + (swapBlue ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgr2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + scn == 4 ? \ + (swapBlue ? 
\ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgrx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraCvtColor_Invoker(rgb2hsv, rgb2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 180) +TegraCvtColor_Invoker(bgr2hsv, bgr2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 180) +TegraCvtColor_Invoker(rgbx2hsv, rgbx2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 180) +TegraCvtColor_Invoker(bgrx2hsv, bgrx2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 180) +TegraCvtColor_Invoker(rgb2hsvf, rgb2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 256) +TegraCvtColor_Invoker(bgr2hsvf, bgr2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 256) +TegraCvtColor_Invoker(rgbx2hsvf, rgbx2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 256) +TegraCvtColor_Invoker(bgrx2hsvf, bgrx2hsv, src_data + static_cast(range.start) * src_step, src_step, \ + dst_data + static_cast(range.start) * dst_step, dst_step, 256) +#define TEGRA_CVTBGRTOHSV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV) \ +( \ + isHSV && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \ + scn == 3 ? \ + (swapBlue ? \ + isFullRange ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgb2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + isFullRange ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgr2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgr2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + scn == 4 ? \ + (swapBlue ? \ + isFullRange ? \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_rgbx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + isFullRange ? 
\ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgrx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) : \ + parallel_for_(Range(0, height), \ + TegraCvtColor_bgrx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \ + (width * height) / static_cast(1<<16)) ), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_CVT2PYUVTOBGR(src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + dcn == 3 ? \ + uIdx == 0 ? \ + (swapBlue ? \ + CAROTENE_NS::yuv420i2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step) : \ + CAROTENE_NS::yuv420i2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step)), \ + CV_HAL_ERROR_OK : \ + uIdx == 1 ? \ + (swapBlue ? \ + CAROTENE_NS::yuv420sp2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step) : \ + CAROTENE_NS::yuv420sp2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + dcn == 4 ? \ + uIdx == 0 ? \ + (swapBlue ? \ + CAROTENE_NS::yuv420i2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step) : \ + CAROTENE_NS::yuv420i2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step)), \ + CV_HAL_ERROR_OK : \ + uIdx == 1 ? \ + (swapBlue ? \ + CAROTENE_NS::yuv420sp2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step) : \ + CAROTENE_NS::yuv420sp2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, \ + src_data + src_step * dst_height, src_step, \ + dst_data, dst_step)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR TEGRA_CVTBGRTOBGR +#undef cv_hal_cvtBGRtoBGR5x5 +#define cv_hal_cvtBGRtoBGR5x5 TEGRA_CVTBGRTOBGR565 +#undef cv_hal_cvtBGRtoGray +#define cv_hal_cvtBGRtoGray TEGRA_CVTBGRTOGRAY +#undef cv_hal_cvtGraytoBGR +#define cv_hal_cvtGraytoBGR TEGRA_CVTGRAYTOBGR +#undef cv_hal_cvtBGRtoYUV +#define cv_hal_cvtBGRtoYUV TEGRA_CVTBGRTOYUV +#undef cv_hal_cvtBGRtoHSV +#define cv_hal_cvtBGRtoHSV TEGRA_CVTBGRTOHSV +#undef cv_hal_cvtTwoPlaneYUVtoBGR +#define cv_hal_cvtTwoPlaneYUVtoBGR TEGRA_CVT2PYUVTOBGR + +#endif // OPENCV_IMGPROC_HAL_INTERFACE_H + +#endif diff --git a/3rdparty/carotene/include/carotene/definitions.hpp b/3rdparty/carotene/include/carotene/definitions.hpp new file mode 100644 index 0000000000..124a674d61 --- /dev/null +++ b/3rdparty/carotene/include/carotene/definitions.hpp @@ -0,0 +1,47 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. 
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_DEFINITIONS_HPP +#define CAROTENE_DEFINITIONS_HPP + +#ifndef CAROTENE_NS +#define CAROTENE_NS carotene +#endif + +#endif diff --git a/3rdparty/carotene/include/carotene/functions.hpp b/3rdparty/carotene/include/carotene/functions.hpp new file mode 100644 index 0000000000..76d1328194 --- /dev/null +++ b/3rdparty/carotene/include/carotene/functions.hpp @@ -0,0 +1,2492 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_FUNCTIONS_HPP
+#define CAROTENE_FUNCTIONS_HPP
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS {
+    /* If this returns false, none of the functions will work. */
+    bool isSupportedConfiguration();
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] + src1[p]
+    */
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u16 * src0Base, ptrdiff_t src0Stride,
+             const u16 * src1Base, ptrdiff_t src1Stride,
+             u16 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s32 * src0Base, ptrdiff_t src0Stride,
+             const s32 * src1Base, ptrdiff_t src1Stride,
+             s32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u32 * src0Base, ptrdiff_t src0Stride,
+             const u32 * src1Base, ptrdiff_t src1Stride,
+             u32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride);
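As a quick usage sketch (not part of the patch): the strides are byte offsets between consecutive rows, and the declarations above are called directly on raw buffers. This assumes the carotene headers are on the include path and that CONVERT_POLICY_SATURATE is the saturating enumerator from carotene/types.hpp:

    #include <vector>
    #include <carotene/functions.hpp>

    int main()
    {
        // 4 columns x 2 rows of u8; each row is 4 bytes, so the stride is 4
        const CAROTENE_NS::Size2D size(4, 2);
        std::vector<CAROTENE_NS::u8> a(8, 200), b(8, 100), dst(8, 0);

        if (!CAROTENE_NS::isSupportedConfiguration())
            return 1;  // no supported (NEON) configuration on this machine

        CAROTENE_NS::add(size, a.data(), 4, b.data(), 4, dst.data(), 4,
                         CAROTENE_NS::CONVERT_POLICY_SATURATE);
        // each dst element is now 255: 200 + 100 saturates at the u8 maximum
        return 0;
    }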
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] - src1[p]
+    */
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             f32 *dstBase, ptrdiff_t dstStride);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u16 * src0Base, ptrdiff_t src0Stride,
+             const u16 * src1Base, ptrdiff_t src1Stride,
+             u16 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const s32 * src0Base, ptrdiff_t src0Stride,
+             const s32 * src1Base, ptrdiff_t src1Stride,
+             s32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u32 * src0Base, ptrdiff_t src0Stride,
+             const u32 * src1Base, ptrdiff_t src1Stride,
+             u32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride);
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] * alpha + src1[p] * beta + gamma
+    */
+    void addWeighted(const Size2D &size,
+                     const u8 * src0Base, ptrdiff_t src0Stride,
+                     const u8 * src1Base, ptrdiff_t src1Stride,
+                     u8 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const s8 * src0Base, ptrdiff_t src0Stride,
+                     const s8 * src1Base, ptrdiff_t src1Stride,
+                     s8 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const u16 * src0Base, ptrdiff_t src0Stride,
+                     const u16 * src1Base, ptrdiff_t src1Stride,
+                     u16 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const s16 * src0Base, ptrdiff_t src0Stride,
+                     const s16 * src1Base, ptrdiff_t src1Stride,
+                     s16 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const u32 * src0Base, ptrdiff_t src0Stride,
+                     const u32 * src1Base, ptrdiff_t src1Stride,
+                     u32 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const s32 * src0Base, ptrdiff_t src0Stride,
+                     const s32 * src1Base, ptrdiff_t src1Stride,
+                     s32 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    void addWeighted(const Size2D &size,
+                     const f32 * src0Base, ptrdiff_t src0Stride,
+                     const f32 * src1Base, ptrdiff_t src1Stride,
+                     f32 * dstBase, ptrdiff_t dstStride,
+                     f32 alpha, f32 beta, f32 gamma);
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = min(src0[p], src1[p])
+    */
+    void min(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride);
+
+    void min(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride);
+
+    void min(const Size2D &size,
+             const u16 * src0Base, ptrdiff_t src0Stride,
+             const u16 * src1Base, ptrdiff_t src1Stride,
+             u16 * dstBase, ptrdiff_t dstStride);
+
+    void min(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride);
+
+    void
min(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = max(src0[p], src1[p]) + */ + void max(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * src1[p] * scale + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * scale / src1[p] + + NOTE: ROUND_TO_ZERO convert policy is used + 
*/ + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = scale / src[p] + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, set `dst[p]` to the median + of `src[p]` and the 8 points around it. If `srcMargin` is + zero on any side, get the neighbors on that side by replicating + the edge. + */ + bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels); + void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Apply a half Gaussian filter + half Scale, as one level of a Gaussian + pyramid. For all `p` within `dstSize`, set `dst[p]` to `f[2 * p]`, where + `f` is an image of size srcSize obtained by filtering src with the 5x5 + Gaussian kernel ([1 4 6 4 1]'*[1 4 6 4 1]/256) using the border mode + passed in, and round-to-zero rounding. + dstSize must be (srcSize.width / 2, srcSize.height / 2), rounded by any method. 
+ */ + bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border); + void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* Same as above, but uses round-half-up rounding. */ + + bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + + bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? trueValue : falseValue + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = lower <= src[p] && src[p] <= upper ? trueValue : falseValue + */ + void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? value : 0 + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : value + */ + void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? threshold : src[p] + */ + void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? src[p] : 0 + */ + void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZero(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : src[p] + */ + void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = abs(src0[p] - src1[p]) + */ + void absDiff(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = ~src[p] + */ + void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] & src1[p] + */ + void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] | src1[p] + */ + void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] ^ src1[p] + */ + void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] == src1[p] ? 
255 : 0 + */ + void cmpEQ(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] != src1[p] ? 255 : 0 + */ + void cmpNE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] > src1[p] ? 
255 : 0 + */ + void cmpGT(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] >= src1[p] ? 255 : 0 + */ + void cmpGE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Calculates dot product + */ + f64 dotProduct(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + Calculates mean and stddev + */ + void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + /* + For each point `p` within `size`, do: + dst[p] = sqrt(src0[p] ^ 2 + src1[p] ^ 2) + */ + void magnitude(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void magnitude(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride); + + /* + Compute an integral image + */ + void integral(const Size2D 
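+ /*
+ (Illustrative note, not from the original header; `src`, `sum`, the strides
+ and `width`/`height` are assumptions.) An integral image makes rectangular
+ sums cheap lookups. A minimal u8 -> u32 call might look like:
+
+ Size2D roi(width, height);
+ integral(roi, src, srcStride, sum, sumStride);
+
+ after which the sum over a rectangle follows from the four corner entries
+ of `sum` (the exact indexing convention is defined by integral.cpp).
+ */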
&size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride); + + /* + Compute an integral of squared image values + */ + void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride); + + /* + Find the min and max values among all pixels `p` within `src` + */ + void minMaxVals(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 * minVal, u8 * maxVal); + + void minMaxVals(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 * minVal, s16 * maxVal); + + void minMaxVals(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 * minVal, u16 * maxVal); + + void minMaxVals(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 * minVal, s32 * maxVal); + + void minMaxVals(const Size2D &size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 * minVal, u32 * maxVal); + + /* + Fill the arrays `minLocPtr`, `maxLocPtr` with locations of + given values `minVal`, `maxVal` + */ + void fillMinMaxLocs(const Size2D & size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + /* + Find the min and max values among all pixels `p` within `src`, and their first occurrences + */ + void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol,
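+ /*
+ (Illustrative sketch; `img`, `imgStride`, `w`, `h` are assumptions.)
+ Locating the extrema of a grayscale image with the u8 overload above:
+
+ u8 minV, maxV;
+ size_t minX, minY, maxX, maxY;
+ minMaxLoc(Size2D(w, h), img, imgStride, minV, minX, minY, maxV, maxX, maxY);
+ */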
size_t &maxRow); + + /* + For each point `p` within `size`, do: + dst[p] += src[p] + */ + void accumulate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = (dst[p] + ((src[p] ^ 2) >> shift)) + */ + void accumulateSquare(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do: + dst[p] = (1 - alpha) * dst[p] + alpha * src[p] + */ + void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha); + + /* + orient[p] = atan2(src0[p], src1[p]) + */ + void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * orientBase, ptrdiff_t orientStride); + + void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * orientBase, ptrdiff_t orientStride, + f32 scale); + + /* + Combine 2 planes to a single one + */ + void combine2(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to a single one + */ + void combine3(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 4 planes to a single one + */ + void combine4(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + const u8 * src3Base, ptrdiff_t src3Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + const u16 * src3Base, ptrdiff_t src3Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + const s32 * src3Base, ptrdiff_t src3Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine4(const 
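+ /*
+ (Illustrative sketch; the plane names are assumptions.) combine3
+ interleaves three u8 planes into one packed 3-channel image, so each
+ destination row carries 3 * width elements:
+
+ combine3(Size2D(w, h), rPlane, rStride, gPlane, gStride, bPlane, bStride,
+          packed, packedStride);
+ */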
Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + const s64 * src3Base, ptrdiff_t src3Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to YUYV one + */ + void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to UYVY one + */ + void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to grayscale one + */ + void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to grayscale one + */ + void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to grayscale one + */ + void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to grayscale one + */ + void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGB one + */ + void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGBX one + */ + void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGBX + */ + void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB + */ + void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR + */ + void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGRX + */ + void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGR + */ + void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGRX + */ + void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to HSV + */ + void rgb2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to HSV + */ + void rgbx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGR image to HSV + */ + void bgr2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGRX image to HSV + */ + void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> 
GggBBBBb RRRRrGGG + */ + void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggBBBBb RRRRrGGG + */ + void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> GggRRRRr BBBBbGGG + */ + void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggRRRRr BBBBbGGG + */ + void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to YCrCb + */ + void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to YCrCb + */ + void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to YCrCb + */ + void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to YCrCb + */ + void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGB + */ + void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGBX + */ + void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGB + */ + void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGBX + */ + void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGR + */ + void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGRX + */ + void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGR + */ + void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGRX + */ + void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] << shift + */ + void lshift(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do sign-extending shift: + dst[p] = src[p] >> shift + */ + void rshift(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 shift, CONVERT_POLICY cpolicy); + + /* + For each point `p` within `size`, set 
`dst[p]` to the average + of `src[p]` and the 8 (or 24 for blur5x5) points around it + NOTE: the function cannot operate inplace + */ + bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void blur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + void blur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + /* + For each point `p` within `size`, set `dst[p]` to the average + of `src[p]` and the 8 points around it + NOTE: the function can operate inplace + */ + bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, f32 borderValue, Margin borderMargin); + + bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + For each point `p` within `size`, set `dst[p]` to gaussian smooth + of `src[p]` and the 8(24 for 5x5 version) points around it + NOTE: the function cannot operate inplace + */ + bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin = Margin()); + void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + Calculation of Sobel operator + NOTE: the function cannot operate inplace + */ + bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Sobel3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Calculation of Sobel operator for f32 data + NOTE: the function 
can operate inplace + */ + bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy); + void Sobel3x3(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, f32 borderValue); + + /* + Calculation of Scharr operator + NOTE: the function cannot operate inplace + */ + bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Scharr3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin = Margin()); + + void ScharrDeriv(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + /* + Calculation of generic separable filtering operator + rowFilter/colFilter define filter weights + 0 - predefined 1 2 1 + 1 - predefined -1 0 1 + 2 - predefined 1 -2 1 + 3 - weights provided as xw/yw + */ + bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void SeparableFilter3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Extract a single plane from 2 channel image + */ + void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 3 channel image + */ + void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 4 channel image + */ + void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Split 2 channel image to separate planes + */ + void split2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride); + + /* + Split 3 channel image to separate planes + */ + void split3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride); + 
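+ /*
+ (Illustrative sketch; buffer names are assumptions.) split3 is the inverse
+ of combine3: it de-interleaves a packed 3-channel u8 image into separate
+ caller-allocated planes:
+
+ split3(Size2D(w, h), packed, packedStride,
+        rPlane, rStride, gPlane, gStride, bPlane, bStride);
+ */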
+ /* + Split 4 channel image to separate planes + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride, + u8 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride, + u16 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride, + s32 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride, + s64 * dst3Base, ptrdiff_t dst3Stride); + + /* + Split 4 channel image to 3 channel image and 1 channel image + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst3Base, ptrdiff_t dst3Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + /* + Flip image using specified flip mode + */ + bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize); + void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize); + + /* + For each point `p` within `size`, set `dst[p]` to the maximum + of `src[p]` and the 8 points around it + NOTE: the function cannot operate inplace + */ + bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border); + + void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + /* + Resize a source image using "nearest neighbor" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize); + void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 elemSize); + + /* + Resize a source image using "area" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels); + void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, 
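+ /*
+ (Illustrative sketch; all names are assumptions.) The ratios are source
+ size over destination size, so a 2x downscale uses wr = hr = 2.0f, and the
+ is*Supported query guards the call:
+
+ f32 wr = (f32)ssize.width / dsize.width;
+ f32 hr = (f32)ssize.height / dsize.height;
+ if (isResizeAreaSupported(wr, hr, 1))
+     resizeArea(ssize, dsize, src, srcStride, dst, dstStride, wr, hr, 1);
+ */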
u32 channels); + + /* + Resize a source image using "linear" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels); + bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize, + f32 wr, f32 hr, u32 channels); + void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + + /* + For each point `p` within `size`, set `dst[p]` to convolution + of `src[p]` and the (ksize * ksize - 1) points around it + The function uses OpenVX semantic (so, in order to use this function + in OpenCV you should flip kernel in both directions) + NOTE: the function cannot operate inplace + */ + bool isConvolutionSupported(const Size2D &size, const Size2D &ksize, BORDER_MODE border); + void convolution(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, + const Size2D & ksize, s16 * kernelBase, u32 scale); + + /* + For each point `p` within `dstSize`, does convolution + of tmpl points and size*size square of src points starting with `src[p]`. + Src should be of size (dstSize+size-1)*(dstSize+size-1) + NOTE: the function cannot operate inplace + */ + bool isMatchTemplateSupported(const Size2D &tmplSize); + void matchTemplate(const Size2D &srcSize, + const u8 * srcBase, ptrdiff_t srcStride, + const Size2D &tmplSize, + const u8 * tmplBase, ptrdiff_t tmplStride, + f32 * dstBase, ptrdiff_t dstStride, + bool normalize); + + /* + Calculation of Laplacian operator + + 1 1 1 + 1 -8 1 + 1 1 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border); + void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + OpenCV like calculation of Laplacian operator + + kernel 1 kernel 3 kernel 5 + 0 1 0 2 0 2 1 2 2 2 1 + 1 -4 1 0 -8 0 2 0 -4 0 2 + 0 1 0 2 0 2 2 -4 -12 -4 2 + 2 0 -4 0 2 + 1 2 2 2 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border); + void Laplacian1OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + Detect image edges using Canny algorithm + These functions perform derivatives estimation using sobel algorithm + */ + bool isCanny3x3Supported(const Size2D &size); + void Canny3x3L1(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + void Canny3x3L2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + /* + Detect image edges using 
Canny algorithm + These functions don't estimate derivatives and thus require + precomputed derivatives estimation instead of source image + */ + void Canny3x3L1(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + void Canny3x3L2(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + /* + Performs detection of FAST features + */ + void FAST(const Size2D &size, + u8 *srcBase, ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression); + + /* + Remap a source image using table and specified + extrapolation method + */ + bool isRemapNearestNeighborSupported(const Size2D &ssize); + void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isRemapLinearSupported(const Size2D &ssize); + void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform an affine transform on an input image + + src_x = dst_x * m[0] + dst_y * m[2] + m[4] + src_y = dst_x * m[1] + dst_y * m[3] + m[5] + */ + bool isWarpAffineNearestNeighborSupported(const Size2D &ssize); + void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpAffineLinearSupported(const Size2D &ssize); + void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform a perspective transform on an input image + + src_x = dst_x * m[0] + dst_y * m[3] + m[6] + src_y = dst_x * m[1] + dst_y * m[4] + m[7] + w = dst_x * m[2] + dst_y * m[5] + m[8] + + src_x = w == 0 ? 0 : src_x / w + src_y = w == 0 ? 
0 : src_y / w + */ + bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize); + void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpPerspectiveLinearSupported(const Size2D &ssize); + void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Convert data from source to destination type + */ + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const 
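+ /*
+ (Illustrative sketch; the buffers are assumptions.) Overload resolution
+ picks the conversion from the pointer types, e.g. widening u8 data to f32:
+
+ convert(Size2D(w, h), src8, src8Stride, dst32f, dst32fStride);
+ */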
Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + /* + Convert data from source to destination type with scaling + dst = saturate_cast(src * alpha + beta) + */ + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t 
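+ /*
+ (Illustrative sketch; names are assumptions.) Mapping u8 data into [0, 1]
+ as f32 follows directly from dst = saturate_cast(src * alpha + beta):
+
+ convertScale(Size2D(w, h), src8, src8Stride, dst32f, dst32fStride,
+              1.0 / 255.0, 0.0);
+ */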
srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + /* + Reduce a matrix to a vector by applying the given operation to each column + */ + void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase); + + void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + /* + For each point `p` within `size`, do: + dst[p] = (rng1[p] <= src[p] && src[p] <= rng2[p]) ?
255 : 0 + */ + + void inRange(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + const u8 * rng1Base, ptrdiff_t rng1Stride, + const u8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + const s8 * rng1Base, ptrdiff_t rng1Stride, + const s8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + const u16 * rng1Base, ptrdiff_t rng1Stride, + const u16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + const s16 * rng1Base, ptrdiff_t rng1Stride, + const s16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + const s32 * rng1Base, ptrdiff_t rng1Stride, + const s32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + const f32 * rng1Base, ptrdiff_t rng1Stride, + const f32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Count the number of non-zero elements + */ + s32 countNonZero(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f64 * srcBase, ptrdiff_t srcStride); + + /* + Calculates sum of all image pixel values and squared values + */ + bool isSumSupported(u32 channels); + + void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels); + + void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels); + + bool isSqsumSupported(u32 channels); + + void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels); + + /* + Calculates norm + */ + s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s16 *
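+ /*
+ (Illustrative sketch; `a`, `b`, the strides and `roi` are assumptions.)
+ A relative L2 error between two f32 images, using the norm and diffNorm
+ declarations in this section:
+
+ f64 err = diffNormL2(roi, a, aStride, b, bStride);
+ f64 ref = normL2(roi, b, bStride);
+ f64 rel = ref > 0 ? err / ref : err;
+ */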
srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + /* + Calculates norm of per element difference + */ + s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ + void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold); +} + +#endif diff --git a/3rdparty/carotene/include/carotene/types.hpp b/3rdparty/carotene/include/carotene/types.hpp new file mode 100644 index 0000000000..81b03d649a --- /dev/null +++ b/3rdparty/carotene/include/carotene/types.hpp @@ -0,0 +1,125 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_TYPES_HPP +#define CAROTENE_TYPES_HPP + +#include <cassert> +#include <cstddef> +#include <stdint.h> + +#ifndef UINT32_MAX + #define UINT32_MAX (4294967295U) +#endif + +namespace CAROTENE_NS { + using std::size_t; + using std::ptrdiff_t; + + typedef int8_t s8; + typedef uint8_t u8; + typedef int16_t s16; + typedef uint16_t u16; + typedef int32_t s32; + typedef uint32_t u32; + typedef float f32; + typedef int64_t s64; + typedef uint64_t u64; + typedef double f64; + + typedef ptrdiff_t stride_t; + + enum CONVERT_POLICY + { + CONVERT_POLICY_WRAP, + CONVERT_POLICY_SATURATE + }; + + enum BORDER_MODE + { + BORDER_MODE_UNDEFINED, + BORDER_MODE_CONSTANT, + BORDER_MODE_REPLICATE, + BORDER_MODE_REFLECT, + BORDER_MODE_REFLECT101, + BORDER_MODE_WRAP + }; + + enum FLIP_MODE + { + FLIP_HORIZONTAL_MODE = 1, + FLIP_VERTICAL_MODE = 2, + FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE + }; + + enum COLOR_SPACE + { + COLOR_SPACE_BT601, + COLOR_SPACE_BT709 + }; + + struct Size2D { + Size2D() : width(0), height(0) {} + Size2D(size_t width_, size_t height_) : width(width_), height(height_) {} + + size_t width; + size_t height; + + inline size_t total() const + { + return width * height; + } + }; + + struct Margin { + Margin() : left(0), right(0), top(0), bottom(0) {} + Margin(size_t left_, size_t right_, size_t top_, size_t bottom_) + : left(left_), right(right_), top(top_), bottom(bottom_) {} + + // these are measured in elements + size_t left, right, top, bottom; + }; + + struct KeypointStore { + virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0; + virtual ~KeypointStore() {}; + }; +} + +#endif diff --git a/3rdparty/carotene/src/absdiff.cpp b/3rdparty/carotene/src/absdiff.cpp new file mode 100644 index 0000000000..02008ceb3e --- /dev/null +++ b/3rdparty/carotene/src/absdiff.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution.
diff --git a/3rdparty/carotene/src/absdiff.cpp b/3rdparty/carotene/src/absdiff.cpp
new file mode 100644
index 0000000000..02008ceb3e
--- /dev/null
+++ b/3rdparty/carotene/src/absdiff.cpp
@@ -0,0 +1,241 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <carotene/functions.hpp>
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T>
+struct AbsDiff
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vabdq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vabd(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0];
+    }
+};
+
+template <typename T>
+struct AbsDiffSigned
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
+        typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
+        v_dst = internal::vqsubq(v_max, v_min);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
+        typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
+        v_dst = internal::vqsub(v_max, v_min);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ?
+                                            (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void absDiff(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiff<u8>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void absDiff(const Size2D &size,
+             const u16 *src0Base, ptrdiff_t src0Stride,
+             const u16 *src1Base, ptrdiff_t src1Stride,
+             u16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiff<u16>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void absDiff(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiffSigned<s8>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void absDiff(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiffSigned<s16>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void absDiff(const Size2D &size,
+             const s32 *src0Base, ptrdiff_t src0Stride,
+             const s32 *src1Base, ptrdiff_t src1Stride,
+             s32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiffSigned<s32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void absDiff(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride, AbsDiff<f32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
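All absDiff overloads share the same base-pointer/stride calling convention. A usage
sketch (illustrative only, not part of the patch; it assumes a CAROTENE_NEON build,
since the fallback path merely discards its arguments):

    #include <carotene/functions.hpp>

    void absDiffExample()
    {
        // Hypothetical dense 640x480 single-channel u8 images: one row is
        // 640 elements, so consecutive rows are 640 bytes apart.
        static CAROTENE_NS::u8 a[480 * 640], b[480 * 640], d[480 * 640];
        CAROTENE_NS::absDiff(CAROTENE_NS::Size2D(640, 480), a, 640, b, 640, d, 640);
    }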
diff --git a/3rdparty/carotene/src/accumulate.cpp b/3rdparty/carotene/src/accumulate.cpp
new file mode 100644
index 0000000000..ee9ce22d35
--- /dev/null
+++ b/3rdparty/carotene/src/accumulate.cpp
@@ -0,0 +1,408 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+void accumulate(const Size2D &size,
+                const u8 *srcBase, ptrdiff_t srcStride,
+                s16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j);
+            int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+            v_dst0 = vqaddq_s16(v_dst0, v_src0);
+            v_dst1 = vqaddq_s16(v_dst1, v_src1);
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        for (; j < roiw8; j += 8)
+        {
+            uint8x8_t v_src = vld1_u8(src + j);
+            int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            v_dst = vqaddq_s16(v_dst, v_src16);
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        for (; j < size.width; j++)
+            dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <int shift>
+void accumulateSquareConst(const Size2D &size,
+                           const u8 *srcBase, ptrdiff_t srcStride,
+                           s16 *dstBase, ptrdiff_t dstStride)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+
+            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
+            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
+                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
+
+            v_srclo = vget_low_s16(v_src1);
+            v_srchi = vget_high_s16(v_src1);
+            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
+                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
+
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        for (; j < roiw8; j += 8)
+        {
+            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
+            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
+                                 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        for (; j < size.width; j++)
+        {
+            s32 srcVal = src[j];
+            dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
+        }
+    }
+}
+
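+// vshrq_n_s32 only accepts immediate shift amounts in the range 1..32, so the
+// generic template above cannot even be instantiated with shift == 0; the
+// specialization below performs the same accumulation with the shift dropped.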
+template <>
+void accumulateSquareConst<0>(const Size2D &size,
+                              const u8 *srcBase, ptrdiff_t srcStride,
+                              s16 *dstBase, ptrdiff_t dstStride)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+
+            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
+            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
+                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
+
+            v_srclo = vget_low_s16(v_src1);
+            v_srchi = vget_high_s16(v_src1);
+            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
+                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
+
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        for (; j < roiw8; j += 8)
+        {
+            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
+            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
+                                 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        for (; j < size.width; j++)
+        {
+            s32 srcVal = src[j];
+            dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
+        }
+    }
+}
+
+typedef void (* accumulateSquareConstFunc)(const Size2D &size,
+                                           const u8 *srcBase, ptrdiff_t srcStride,
+                                           s16 *dstBase, ptrdiff_t dstStride);
+
+} // namespace
+
+#endif
+
+void accumulateSquare(const Size2D &size,
+                      const u8 *srcBase, ptrdiff_t srcStride,
+                      s16 *dstBase, ptrdiff_t dstStride,
+                      u32 shift)
+{
+    if (shift >= 16)
+    {
+        for (size_t i = 0; i < size.height; ++i)
+        {
+            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+            std::memset(dst, 0, sizeof(s16) * size.width);
+        }
+        return;
+    }
+
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    //     return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+
+    accumulateSquareConstFunc funcs[16] =
+    {
+        accumulateSquareConst<0>,
+        accumulateSquareConst<1>,
+        accumulateSquareConst<2>,
+        accumulateSquareConst<3>,
+        accumulateSquareConst<4>,
+        accumulateSquareConst<5>,
+        accumulateSquareConst<6>,
+        accumulateSquareConst<7>,
+        accumulateSquareConst<8>,
+        accumulateSquareConst<9>,
+        accumulateSquareConst<10>,
+        accumulateSquareConst<11>,
+        accumulateSquareConst<12>,
+        accumulateSquareConst<13>,
+        accumulateSquareConst<14>,
+        accumulateSquareConst<15>
+    }, func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+struct AccumulateWeightedHalf
+{
+    typedef u8 type;
+
+    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
+                     uint8x16_t & v_dst) const
+    {
+        v_dst = vhaddq_u8(v_src0, v_src1);
+    }
+
+    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
+                     uint8x8_t & v_dst) const
+    {
+        v_dst =
vhadd_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = ((u16)(src0[0]) + src1[0]) >> 1; + } +}; + +struct AccumulateWeighted +{ + typedef u8 type; + + float alpha, beta; + float32x4_t v_alpha, v_beta; + + explicit AccumulateWeighted(float _alpha) : + alpha(_alpha), beta(1 - _alpha) + { + v_alpha = vdupq_n_f32(alpha); + v_beta = vdupq_n_f32(beta); + } + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)); + } + + void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1, + uint8x8_t & v_dst) const + { + uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1); + + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))); + uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vmovn_u16(_v_dst); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = beta * src1[0] + alpha * src0[0]; + } +}; + +} // namespace + +#endif + +void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha) +{ + if (alpha == 0.0f) + return; + if (alpha == 1.0f) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memcpy(dst, src, sizeof(u8) * size.width); + } + return; + } + + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + // in this case we can use the following scheme: + // dst[p] = (src[p] + dst[p]) >> 1 + // which is faster + if (alpha == 0.5f) + { + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeightedHalf()); + + return; + } + + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeighted(alpha)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)alpha; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/add.cpp 
b/3rdparty/carotene/src/add.cpp new file mode 100644 index 0000000000..e8ace53122 --- /dev/null +++ b/3rdparty/carotene/src/add.cpp @@ -0,0 +1,475 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T, typename WT>
+struct AddWrap
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vaddq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vadd(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
+    }
+};
+
+template <typename T, typename WT>
+struct AddSaturate
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vqaddq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vqadd(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<u8, u16>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u8, u16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<s8, s16>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s8, s16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
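+// The u8 + u8 -> s16 overload below widens with vaddl_u8, so the largest
+// possible sum (255 + 255 = 510) always fits in the destination and the
+// wrap/saturate distinction disappears; its CONVERT_POLICY parameter is
+// therefore deliberately unnamed and ignored.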
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw32; j += 32)
+        {
+            internal::prefetch(src0 + j);
+            internal::prefetch(src1 + j);
+            uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
+            uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
+            vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
+            vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
+            vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
+            vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
+        }
+        for (; j < roiw8; j += 8)
+        {
+            uint8x8_t v_src0 = vld1_u8(src0 + j);
+            uint8x8_t v_src1 = vld1_u8(src1 + j);
+            vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
+        }
+
+        for (; j < size.width; j++)
+            dst[j] = (u16)src0[j] + (u16)src1[j];
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<s16, s32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s16, s32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 * dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<u16, u32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u16, u32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<s32, s64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s32, s64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const u32 * src0Base, ptrdiff_t src0Stride,
+         const u32 * src1Base, ptrdiff_t src1Stride,
+         u32 * dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddSaturate<u32, u64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u32, u64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void add(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddWrap<f32, f32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
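A small worked example of the two conversion policies (illustrative only, not part of
the patch; it assumes a CAROTENE_NEON build, since the fallback discards its arguments):
adding the u8 values 200 and 100 gives 300, which CONVERT_POLICY_SATURATE clamps to 255,
while CONVERT_POLICY_WRAP keeps only the low eight bits, 300 mod 256 = 44:

    #include <carotene/functions.hpp>

    void policyExample()
    {
        CAROTENE_NS::u8 a = 200, b = 100, sat = 0, wrap = 0;
        CAROTENE_NS::Size2D one(1, 1);  // a one-pixel "image"
        CAROTENE_NS::add(one, &a, 1, &b, 1, &sat, 1, CAROTENE_NS::CONVERT_POLICY_SATURATE);  // sat == 255
        CAROTENE_NS::add(one, &a, 1, &b, 1, &wrap, 1, CAROTENE_NS::CONVERT_POLICY_WRAP);     // wrap == 44
    }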
diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp
new file mode 100644
index 0000000000..1f89fb5372
--- /dev/null
+++ b/3rdparty/carotene/src/add_weighted.cpp
@@ -0,0 +1,265 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+using namespace internal;
+
+template <typename T> struct TypeTraits;
+template <> struct TypeTraits< u8> { typedef u16 wide;                     typedef  u8 unsign; typedef  uint8x16_t vec128; };
+template <> struct TypeTraits< s8> { typedef s16 wide;                     typedef  u8 unsign; typedef   int8x16_t vec128; };
+template <> struct TypeTraits<u16> { typedef u32 wide; typedef  u8 narrow; typedef u16 unsign; typedef  uint16x8_t vec128; };
+template <> struct TypeTraits<s16> { typedef s32 wide; typedef  s8 narrow; typedef u16 unsign; typedef   int16x8_t vec128; };
+template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef  uint32x4_t vec128; };
+template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef   int32x4_t vec128; };
+template <> struct TypeTraits<f32> { typedef f64 wide;                                         typedef float32x4_t vec128; };
+
+template <typename T> struct wAdd
+{
+    typedef T type;
+
+    f32 alpha, beta, gamma;
+    typedef typename TypeTraits<T>::wide wtype;
+    wAdd<wtype> wideAdd;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma),
+        wideAdd(_alpha, _beta, _gamma) {}
+
+    void operator() (const typename VecTraits<T>::vec128 & v_src0,
+                     const typename VecTraits<T>::vec128 & v_src1,
+                     typename VecTraits<T>::vec128 & v_dst) const
+    {
+        typename VecTraits<wtype>::vec128 vrl, vrh;
+        wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
+        wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
+
+        v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
+    }
+
+    void operator() (const typename VecTraits<T>::vec64 & v_src0,
+                     const typename VecTraits<T>::vec64 & v_src1,
+                     typename VecTraits<T>::vec64 & v_dst) const
+    {
+        typename VecTraits<wtype>::vec128 vr;
+        wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
+
+        v_dst = vqmovn(vr);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+};
+
+template <> struct wAdd<s32>
+{
+    typedef s32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma)
+    {
+        valpha = vdupq_n_f32(_alpha);
+        vbeta = vdupq_n_f32(_beta);
+        vgamma = vdupq_n_f32(_gamma + 0.5);
+    }
+
+    void operator() (const typename VecTraits<s32>::vec128 & v_src0,
+                     const typename VecTraits<s32>::vec128 & v_src1,
+                     typename VecTraits<s32>::vec128 & v_dst) const
+    {
+        float32x4_t vs1 = vcvtq_f32_s32(v_src0);
+        float32x4_t vs2 = vcvtq_f32_s32(v_src1);
+
+        vs1 = vmlaq_f32(vgamma, vs1, valpha);
+        vs1 = vmlaq_f32(vs1, vs2, vbeta);
+        v_dst = vcvtq_s32_f32(vs1);
+    }
+
+    void operator() (const typename VecTraits<s32>::vec64 & v_src0,
+                     const typename VecTraits<s32>::vec64 & v_src1,
+                     typename VecTraits<s32>::vec64 & v_dst) const
+    {
+        float32x2_t vs1 = vcvt_f32_s32(v_src0);
+        float32x2_t vs2 = vcvt_f32_s32(v_src1);
+
+        vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
+        vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
+        v_dst = vcvt_s32_f32(vs1);
+    }
+
+    void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
+    {
+        dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+};
+
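+// s32, u32 and f32 get explicit specializations (above and below): the generic
+// wAdd<T> recurses into wAdd<TypeTraits<T>::wide>, and NEON has no f32
+// conversions for the 64-bit wide types, so 32-bit elements are blended in f32
+// directly; the 0.5 added to vgamma biases the truncating vcvtq conversion
+// towards round-to-nearest for non-negative results.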
+template <> struct wAdd<u32>
+{
+    typedef u32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma)
+    {
+        valpha = vdupq_n_f32(_alpha);
+        vbeta = vdupq_n_f32(_beta);
+        vgamma = vdupq_n_f32(_gamma + 0.5);
+    }
+
+    void operator() (const typename VecTraits<u32>::vec128 & v_src0,
+                     const typename VecTraits<u32>::vec128 & v_src1,
+                     typename VecTraits<u32>::vec128 & v_dst) const
+    {
+        float32x4_t vs1 = vcvtq_f32_u32(v_src0);
+        float32x4_t vs2 = vcvtq_f32_u32(v_src1);
+
+        vs1 = vmlaq_f32(vgamma, vs1, valpha);
+        vs1 = vmlaq_f32(vs1, vs2, vbeta);
+        v_dst = vcvtq_u32_f32(vs1);
+    }
+
+    void operator() (const typename VecTraits<u32>::vec64 & v_src0,
+                     const typename VecTraits<u32>::vec64 & v_src1,
+                     typename VecTraits<u32>::vec64 & v_dst) const
+    {
+        float32x2_t vs1 = vcvt_f32_u32(v_src0);
+        float32x2_t vs2 = vcvt_f32_u32(v_src1);
+
+        vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
+        vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
+        v_dst = vcvt_u32_f32(vs1);
+    }
+
+    void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
+    {
+        dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+};
+
+template <> struct wAdd<f32>
+{
+    typedef f32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma)
+    {
+        valpha = vdupq_n_f32(_alpha);
+        vbeta = vdupq_n_f32(_beta);
+        vgamma = vdupq_n_f32(_gamma + 0.5);
+    }
+
+    void operator() (const typename VecTraits<f32>::vec128 & v_src0,
+                     const typename VecTraits<f32>::vec128 & v_src1,
+                     typename VecTraits<f32>::vec128 & v_dst) const
+    {
+        float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
+        v_dst = vmlaq_f32(vs1, v_src1, vbeta);
+    }
+
+    void operator() (const typename VecTraits<f32>::vec64 & v_src0,
+                     const typename VecTraits<f32>::vec64 & v_src1,
+                     typename VecTraits<f32>::vec64 & v_dst) const
+    {
+        float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
+        v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
+    }
+
+    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
+    {
+        dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
+    }
+};
+
+} // namespace
+
+#define IMPL_ADDWEIGHTED(type)                                 \
+void addWeighted(const Size2D &size,                           \
+                 const type * src0Base, ptrdiff_t src0Stride,  \
+                 const type * src1Base, ptrdiff_t src1Stride,  \
+                 type * dstBase, ptrdiff_t dstStride,          \
+                 f32 alpha, f32 beta, f32 gamma)               \
+{                                                              \
+    internal::assertSupportedConfiguration();                  \
+    wAdd<type> wgtAdd(alpha,                                   \
+                      beta,                                    \
+                      gamma);                                  \
+    internal::vtransform(size,                                 \
+                         src0Base, src0Stride,                 \
+                         src1Base, src1Stride,                 \
+                         dstBase, dstStride,                   \
+                         wgtAdd);                              \
+}
+
+#else
+
+#define IMPL_ADDWEIGHTED(type)                                 \
+void addWeighted(const Size2D &,                               \
+                 const type *, ptrdiff_t,                      \
+                 const type *, ptrdiff_t,                      \
+                 type *, ptrdiff_t,                            \
+                 f32, f32, f32)                                \
+{                                                              \
+    internal::assertSupportedConfiguration();                  \
+}
+
+#endif
+
+IMPL_ADDWEIGHTED(u8)
+IMPL_ADDWEIGHTED(s8)
+IMPL_ADDWEIGHTED(u16)
+IMPL_ADDWEIGHTED(s16)
+IMPL_ADDWEIGHTED(u32)
+IMPL_ADDWEIGHTED(s32)
+IMPL_ADDWEIGHTED(f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/bitwise.cpp b/3rdparty/carotene/src/bitwise.cpp
new file mode 100644
index 0000000000..ee00775111
--- /dev/null
+++ b/3rdparty/carotene/src/bitwise.cpp
@@ -0,0 +1,225 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +struct BitwiseAnd +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vandq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vand_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] & src1[0]; + } +}; + +struct BitwiseOr +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vorrq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vorr_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] | src1[0]; + } +}; + +struct BitwiseXor +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = veorq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = veor_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] ^ src1[0]; + } +}; + +#endif + +void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vmvn_u8(v_src); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = ~src[j]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseAnd()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseOr()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseXor()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp new file mode 100644 index 0000000000..798cce5a71 --- /dev/null +++ b/3rdparty/carotene/src/blur.cpp @@ -0,0 +1,1337 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS {
+
+bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border)
+{
+    return isSupportedConfiguration() && size.width >= 8 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REPLICATE);
+}
+
+void blur3x3(const Size2D &size,
+             const u8 * srcBase, ptrdiff_t srcStride,
+             u8 * dstBase, ptrdiff_t dstStride,
+             BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isBlur3x3Supported(size, border));
+#ifdef CAROTENE_NEON
+    // vqrdmulhq_s16(a, 3640) computes round(2 * a * 3640 / 65536), i.e.
+    // round(a * 3640 / 32768), and 3640 / 32768 ~= 1/9, so this multiply
+    // implements the 3x3 box-filter normalization without a division.
+    const int16x8_t v_scale = vmovq_n_s16(3640);
+    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
+    const uint16x8_t v_zero = vdupq_n_u16(0);
+    const uint8x8_t v_border = vdup_n_u8(borderValue);
+
+    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
+    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
+
+    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
+
+    for (ptrdiff_t y = 0; y < height; ++y)
+    {
+        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0));
+        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
+        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        s16 prevx = 0, currx = 0, nextx = 0;
+        ptrdiff_t x = 0;
+        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
+
+        // perform vertical convolution
+        for ( ; x <= bwidth; x += 8)
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
+            uint8x8_t x1 = vld1_u8(srow1 + x);
+            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
+
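+            // The block below feeds the scalar epilogue: once fewer than eight
+            // pixels remain, the vector loop stops, and the plain CPU loop after
+            // it needs the running three-row sums prevx/currx as scalars.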
+            // calculate values for plain CPU part below if needed
+            if (x + 8 >= bwidth)
+            {
+                ptrdiff_t x3 = x == width ? width - 1 : x;
+                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0);
+
+                if (border == BORDER_MODE_CONSTANT && x4 < 0)
+                    prevx = borderValue;
+                else
+                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
+
+                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+
+            // and calculate next value
+            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr = v_border_x3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
+
+                continue;
+            }
+
+            // combine 3 "shifted" vectors
+            t0 = vextq_u16(tprev, tcurr, 7);
+            t1 = tcurr;
+            t2 = vextq_u16(tcurr, tnext, 1);
+
+            // and add them
+            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+            int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale);
+            uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
+            vst1_u8(drow + x - 8, it0);
+        }
+
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                    nextx = borderValue * 3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    nextx = srow2[x] + srow1[x] + srow0[x];
+            }
+            else
+                nextx = (srow2 ? srow2[x + 1] : borderValue) +
+                        srow1[x + 1] +
+                        (srow0 ? srow0[x + 1] : borderValue);
+
+            f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f;
+            drow[x] = internal::saturate_cast<u8>((s32)val);
+
+            // make shift
+            prevx = currx;
+            currx = nextx;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border)
+{
+    return isSupportedConfiguration() &&
+           cn > 0 && cn <= 4 &&
+           size.width*cn >= 8 && size.height >= 2 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REFLECT ||
+            border == BORDER_MODE_REPLICATE);
+}
+
+void blur3x3(const Size2D &size, s32 cn,
+             const u8 * srcBase, ptrdiff_t srcStride,
+             u8 * dstBase, ptrdiff_t dstStride,
+             BORDER_MODE borderType, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
+#ifdef CAROTENE_NEON
+//#define FLOAT_VARIANT_1_9
+#ifdef FLOAT_VARIANT_1_9
+    float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
+    float32x4_t v0_5 = vdupq_n_f32 (.5);
+#else
+    const int16x8_t vScale = vmovq_n_s16(3640);
+#endif
+
+    size_t colsn = size.width*cn;
+
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(colsn + 2*cn, borderValue);
+        tmp = &_tmp[cn];
+    }
+
+    uint16x8_t tprev = vdupq_n_u16(0x0);
+    uint16x8_t tcurr = tprev;
+    uint16x8_t tnext = tprev;
+    uint16x8_t t0, t1, t2;
+    if(cn == 1)
+    {
+        for( size_t y = 0; y < size.height; y++ )
+        {
+            const u8* srow0;
+            const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+            const u8* srow2;
+            u8* drow = internal::getRowPtr(dstBase, dstStride, y);
+            if (borderType == BORDER_MODE_REFLECT101) {
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+            } else if (borderType == BORDER_MODE_CONSTANT) {
+                srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+                srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+            } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+            }
+
+            // do vertical convolution
+            size_t x = 0;
+            const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
+            for( ; x <= bcols; x += 8 )
+            {
+                internal::prefetch(srow0 + x);
+                internal::prefetch(srow1 + x);
+                internal::prefetch(srow2 + x);
+
+                uint8x8_t x0 = vld1_u8(srow0 + x);
+                uint8x8_t x1 = vld1_u8(srow1 + x);
+                uint8x8_t x2 = vld1_u8(srow2 + x);
+
+                tprev = tcurr;
+                tcurr = tnext;
+                tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+                if(!x) {
+                    tcurr = tnext;
+
+                    // make border
+                    if (borderType == BORDER_MODE_CONSTANT)
+                    {
+                        tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
+                    }
+                    else if (borderType == BORDER_MODE_REFLECT101)
+                    {
+                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
+                    }
+                    else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE
+                    {
+                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
+                    }
+                    continue;
+                }
+
+                t0 = vextq_u16(tprev, tcurr, 7);
+                t1 = tcurr;
+                t2 = vextq_u16(tcurr, tnext, 1);
+
+                t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+#ifdef FLOAT_VARIANT_1_9
+                uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
+                uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
+                float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
+                float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
+                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
+                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
+                vst1_u8(drow + x - 8, vmovn_u16(t0));
+#else
+                int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
+                uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
+                vst1_u8(drow + x - 8, it0);
+#endif
+            }
+
+            x -= 8;
+            if(x == colsn){
+                x--;
+            }
+            s16 prevx, rowx, nextx;
+            prevx = srow2[x-1] + srow1[x-1] + srow0[x-1];
+            rowx = srow2[x] + srow1[x] + srow0[x];
+            for( ; x < colsn; x++ )
+            {
+                if(x+1 >= colsn) {
+                    // make border
+                    if (borderType == BORDER_MODE_CONSTANT)
+                    {
+                        nextx = borderValue;
+                    } else if (borderType == BORDER_MODE_REFLECT101)
+                    {
+                        nextx = srow2[x-1] + srow1[x-1] + srow0[x-1];
+                    } else {
+                        nextx = srow2[x] + srow1[x] + srow0[x];
+                    }
+                } else {
+                    nextx = srow2[x+1] + srow1[x+1] + srow0[x+1];
+                }
+                *(drow+x) = internal::saturate_cast<u8>((prevx + rowx + nextx)*(1/9.));
+                prevx = rowx;
+                rowx = nextx;
+            }
+        }
+    }
+    else
+    {
+        for( size_t y = 0; y < size.height; y++ )
+        {
+            const u8* srow0;
+            const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+            const u8* srow2;
+            u8* drow = internal::getRowPtr(dstBase, dstStride, y);
+            if (borderType == BORDER_MODE_REFLECT101) {
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+            } else if (borderType == BORDER_MODE_CONSTANT) {
+                srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+                srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+            } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+            }
+
+            // do vertical convolution
+            size_t x = 0;
+            const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
+
+            x -= 8;
+            if(x == colsn){
+                x--;
+            }
+            s16 prevx, rowx, nextx;
+            prevx = srow2[x-1] + srow1[x-1] + srow0[x-1];
+            rowx = srow2[x] + srow1[x] + srow0[x];
+            for( ; x < colsn; x++ )
+            {
+                if(x+1 >= colsn) {
+                    // make border
+                    if (borderType == BORDER_MODE_CONSTANT)
+                    {
+                        nextx = borderValue;
+                    } else if (borderType == BORDER_MODE_REFLECT101)
+                    {
+                        nextx = srow2[x-1] + srow1[x-1] + srow0[x-1];
+                    } else {
+                        nextx = srow2[x] + srow1[x] + srow0[x];
+                    }
+                } else {
+                    nextx = srow2[x+1] + srow1[x+1] + srow0[x+1];
+                }
+                *(drow+x) = internal::saturate_cast<u8>((prevx + rowx + nextx)*(1/9.));
+                prevx = rowx;
+                rowx = nextx;
+            }
+        }
+    }
+    else
+    {
+        for( size_t y = 0; y < size.height; y++ )
+        {
+            const u8* srow0;
+            const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+            const u8* srow2;
+            u8* drow = internal::getRowPtr(dstBase, dstStride, y);
+            if (borderType == BORDER_MODE_REFLECT101) {
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+            } else if (borderType == BORDER_MODE_CONSTANT) {
+                srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+                srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+            } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+                srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+                srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+            }
+
+            // do vertical convolution
+            size_t x = 0;
+            const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
+            for( ; x <= bcols; x += 8 )
+            {
+                internal::prefetch(srow0 + x);
+                internal::prefetch(srow1 + x);
+                internal::prefetch(srow2 + x);
+
+                uint8x8_t x0 = vld1_u8(srow0 + x);
+                uint8x8_t x1 = vld1_u8(srow1 + x);
+                uint8x8_t x2 = vld1_u8(srow2 + x);
+
+                tprev = tcurr;
+                tcurr = tnext;
+                tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+                if(!x) {
+                    tcurr = tnext;
+
+                    // make border
+                    switch(cn)
+                    {
+                    case 2:
+                        if (borderType == BORDER_MODE_CONSTANT)
+                        {
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
+                        }
+                        else if (borderType == BORDER_MODE_REFLECT101)
+                        {
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
+                        }
+                        else
+                        {
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
+                        }
+                        break;
+                    case 3:
+                        if (borderType == BORDER_MODE_CONSTANT)
+                        {
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
+                        }
+                        else if (borderType == BORDER_MODE_REFLECT101)
+                        {
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7);
+                        }
+                        else
+                        {
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7);
+                        }
+                        break;
+                    case 4:
+                        if (borderType == BORDER_MODE_CONSTANT)
+                        {
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
+                            tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
+                        }
+                        else if (borderType != BORDER_MODE_REFLECT101)
+                        {
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
+                            tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
+                        }
+                        break;
+                    }
+                    continue;
+                }
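+
+                // With cn interleaved channels the horizontal neighbours of a
+                // sample sit cn lanes away, so the three taps are built with
+                // vextq_u16 shifts of 8-cn and cn lanes instead of 7 and 1.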
+                if(cn==2)
+                    t0 = vextq_u16(tprev, tcurr, 6);
+                else if(cn==3)
+                    t0 = vextq_u16(tprev, tcurr, 5);
+                else if(cn==4)
+                    t0 = vextq_u16(tprev, tcurr, 4);
+
+                t1 = tcurr;
+
+                if(cn==2)
+                    t2 = vextq_u16(tcurr, tnext, 2);
+                else if(cn==3)
+                    t2 = vextq_u16(tcurr, tnext, 3);
+                else if(cn==4)
+                    t2 = vextq_u16(tcurr, tnext, 4);
+
+                t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+#ifdef FLOAT_VARIANT_1_9
+                uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
+                uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
+                float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
+                float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
+                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
+                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
+                vst1_u8(drow + x - 8, vmovn_u16(t0));
+#else
+                int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
+                uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
+                vst1_u8(drow + x - 8, it0);
+#endif
+            }
+
+            x -= 8;
+            if(x == colsn){
+                x -= cn;
+            }
+            s16 prevx[4], rowx[4], nextx[4];
+            for( s32 k = 0; k < cn; k++ )
+            {
+                prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn];
+                rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k];
+            }
+            for( ; x < colsn; x++ )
+            {
+                size_t xx = x%cn;
+                if(x+cn >= colsn) {
+                    // make border
+                    if (borderType == BORDER_MODE_CONSTANT)
+                    {
+                        nextx[xx] = borderValue;
+                    } else if (borderType == BORDER_MODE_REFLECT101)
+                    {
+                        nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn];
+                    } else {
+                        nextx[xx] = srow2[x] + srow1[x] + srow0[x];
+                    }
+                } else {
+                    nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn];
+                }
+                *(drow+x) = internal::saturate_cast<u8>((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.));
+                prevx[xx] = rowx[xx];
+                rowx[xx] = nextx[xx];
+            }
+        }
+    }
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+void blur5x5(const Size2D &size, s32 cn,
+             const u8 * srcBase, ptrdiff_t srcStride,
+             u8 * dstBase, ptrdiff_t dstStride,
+             BORDER_MODE borderType, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
+#ifdef CAROTENE_NEON
+#define FLOAT_VARIANT_1_25
+#ifdef FLOAT_VARIANT_1_25
+    float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
+    float32x4_t v0_5 = vdupq_n_f32 (.5f);
+#else
+    const int16x8_t vScale = vmovq_n_s16(1310);
+#endif
+    size_t colsn = size.width*cn;
+
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(colsn + 2*cn, borderValue);
+        tmp = &_tmp[cn];
+    }
+
+    uint16x8_t tprev = vdupq_n_u16(0x0);
+    uint16x8_t tcurr = tprev;
+    uint16x8_t tnext = tprev;
+    uint16x8_t t0, t1, t2, t3, t4;
+    for( size_t y = 0; y < size.height; y++ )
+    {
+        const u8 *srow0, *srow1;
+        const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 *srow3, *srow4;
+        u8 *drow = internal::getRowPtr(dstBase, dstStride, y);
+        if (borderType == BORDER_MODE_REFLECT101) {
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y);
+            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y);
+        } else if (borderType == BORDER_MODE_CONSTANT) {
+            srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
+            srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            srow3 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+            srow4 = y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
+        } else if (borderType == BORDER_MODE_REFLECT) {
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y);
+            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y);
+        } else { // BORDER_MODE_REPLICATE
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
+            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1);
+        }
+
+        // do vertical convolution
+        size_t x = 0;
+        const size_t bcols = y + 3 < size.height ?
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + internal::prefetch(srow3 + x); + internal::prefetch(srow4 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + uint8x8_t x3 = vld1_u8(srow3 + x); + uint8x8_t x4 = vld1_u8(srow4 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4); + + if(!x) { + tcurr = tnext; + + if(borderType == BORDER_MODE_REFLECT101 && size.width < 3) + { + x = 8; + break; + } + + // make border + switch(cn) + { + case 1: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + break; + case 2: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + break; + case 3: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7); + s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8]; + tcurr = vsetq_lane_u16(lane8,tprev, 4); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = 
vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + else + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + break; + case 4: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 0); + tcurr = vsetq_lane_u16(borderValue, tcurr, 1); + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + s16 lane8 = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8]; + s16 lane9 = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9]; + s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10]; + s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11]; + tprev = vsetq_lane_u16( lane8,tcurr, 0); + tprev = vsetq_lane_u16( lane9,tprev, 1); + tprev = vsetq_lane_u16(lane10,tprev, 2); + tcurr = vsetq_lane_u16(lane11,tprev, 3); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts + } + else + { + tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part + } + break; + } + continue; + } + switch(cn) + { + case 1: + t0 = vextq_u16(tprev, tcurr, 6); + t1 = vextq_u16(tprev, tcurr, 7); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 1); + t4 = vextq_u16(tcurr, tnext, 2); + break; + case 2: + t0 = vextq_u16(tprev, tcurr, 4); + t1 = vextq_u16(tprev, tcurr, 6); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 2); + t4 = vextq_u16(tcurr, tnext, 4); + break; + case 3: + t0 = vextq_u16(tprev, tcurr, 2); + t1 = vextq_u16(tprev, tcurr, 5); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 3); + t4 = vextq_u16(tcurr, tnext, 6); + break; + case 4: + t0 = tprev; + t1 = vextq_u16(tprev, tcurr, 4); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 4); + t4 = tnext; + break; + default: + internal::assertSupportedConfiguration(false);//Unsupported channels number + return; + } + t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4); + +#ifdef FLOAT_VARIANT_1_25 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x -= cn; + } + s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4]; + ptrdiff_t px = x / cn; + for( s32 k = 0; k < cn; k++ ) + { + ptrdiff_t ploc; + ploc = internal::borderInterpolate(px-2, size.width, borderType); + pprevx[k] = ploc < 0 ? 
5*borderValue :
+                        srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
+
+            ploc = internal::borderInterpolate(px-1, size.width, borderType);
+            prevx[k] = ploc < 0 ? 5*borderValue :
+                       srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
+
+            rowx[k] = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k];
+
+            ploc = internal::borderInterpolate(px+1, size.width, borderType);
+            nextx[k] = ploc < 0 ? 5*borderValue :
+                       srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
+        }
+        x = px*cn;
+        for( ; x < colsn; x+=cn, px++ )
+        {
+            for( s32 k = 0; k < cn; k++ )
+            {
+                ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType);
+                nnextx[k] = ploc < 0 ? 5*borderValue :
+                            srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
+                *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.));
+                pprevx[k] = prevx[k];
+                prevx[k] = rowx[k];
+                rowx[k] = nextx[k];
+                nextx[k] = nnextx[k];
+            }
+        }
+    }
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
+{
+    return isSupportedConfiguration() &&
+           cn > 0 && cn <= 4 &&
+           size.width*cn >= 4 && size.height >= 2 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REFLECT ||
+            border == BORDER_MODE_REPLICATE ||
+            border == BORDER_MODE_WRAP);
+}
+
+void blur3x3(const Size2D &size, s32 cn,
+             const f32 * srcBase, ptrdiff_t srcStride,
+             f32 * dstBase, ptrdiff_t dstStride,
+             BORDER_MODE borderType, f32 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType));
+#ifdef CAROTENE_NEON
+    size_t colsn = size.width * cn;
+
+    std::vector<f32> _tmp;
+    f32 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(colsn + 2*cn, borderValue);
+        tmp = &_tmp[cn];
+    }
+
+    ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
+    ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
+
+    //2-line buffer
+    std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32)));
+    f32* lanea = internal::alignPtr(&_buf[cn], 32);
+    f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
+
+    f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
+    f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
+
+    if (borderType == BORDER_MODE_CONSTANT)
+        for (s32 k = 0; k < cn; ++k)
+        {
+            lanea[-cn+k] = borderValue;
+            lanea[colsn+k] = borderValue;
+            laneA[-cn+k] = borderValue;
+            laneA[colsn+k] = borderValue;
+            laneb[-cn+k] = borderValue;
+            laneb[colsn+k] = borderValue;
+            laneB[-cn+k] = borderValue;
+            laneB[colsn+k] = borderValue;
+        }
+
+    size_t i = 0;
+    f32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
+    for (; i < size.height-1; i+=2)
+    {
+        //vertical convolution
+        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
+        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
+
+        const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ?
internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1); + const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3f32_vert: + float32x4_t v1 = vld1q_f32(ln1 + x); + float32x4_t v2 = vld1q_f32(ln2 + x); + float32x4_t v0 = vld1q_f32(ln0 + x); + float32x4_t v3 = vld1q_f32(ln3 + x); + + float32x4_t v = vaddq_f32(v1, v2); + float32x4_t w0 = vaddq_f32(v, v0); + float32x4_t w1 = vaddq_f32(v, v3); + + vst1q_f32(lanea + x, w0); + vst1q_f32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3f32_vert_ll: + float32x4_t v0 = vld1q_f32(ln0+x); + float32x4_t v1 = vld1q_f32(ln1+x); + float32x4_t v2 = vld1q_f32(ln2+x); + + float32x4_t v = vaddq_f32(v0, v1); + float32x4_t w = vaddq_f32(v, v2); + + vst1q_f32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + f32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3f32_horiz_ll: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t lane0c = vld1q_f32(lanea + x - cn); + float32x4_t lane2c = vld1q_f32(lanea + x + cn); + float32x4_t lane1c = vld1q_f32(lanea + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t vc = vaddq_f32(lane0c, lane2c); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + float32x4_t wc = vaddq_f32(vc, lane1c); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + vst1q_f32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz_last2: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType)); +#ifdef 
CAROTENE_NEON
+    size_t colsn = size.width * cn;
+
+    std::vector<s32> _tmp;
+    s32 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(colsn + 2*cn, borderValue);
+        tmp = &_tmp[cn];
+    }
+
+    ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
+    ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
+
+    //2-line buffer
+    std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32)));
+    s32* lanea = internal::alignPtr(&_buf[cn], 32);
+    s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
+
+    s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
+    s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
+
+    if (borderType == BORDER_MODE_CONSTANT)
+        for (s32 k = 0; k < cn; ++k)
+        {
+            lanea[-cn+k] = borderValue;
+            lanea[colsn+k] = borderValue;
+            laneA[-cn+k] = borderValue;
+            laneA[colsn+k] = borderValue;
+            laneb[-cn+k] = borderValue;
+            laneb[colsn+k] = borderValue;
+            laneB[-cn+k] = borderValue;
+            laneB[colsn+k] = borderValue;
+        }
+
+    size_t i = 0;
+    s32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
+    for (; i < size.height-1; i+=2)
+    {
+        //vertical convolution
+        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
+        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
+
+        const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
+        const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
+        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
+        const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ?
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3s32_vert: + int32x4_t v1 = vld1q_s32(ln1 + x); + int32x4_t v2 = vld1q_s32(ln2 + x); + int32x4_t v0 = vld1q_s32(ln0 + x); + int32x4_t v3 = vld1q_s32(ln3 + x); + + int32x4_t v = vaddq_s32(v1, v2); + int32x4_t w0 = vaddq_s32(v, v0); + int32x4_t w1 = vaddq_s32(v, v3); + + vst1q_s32(lanea + x, w0); + vst1q_s32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3s32_vert_ll: + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + + int32x4_t v = vaddq_s32(v0, v1); + int32x4_t w = vaddq_s32(v, v2); + + vst1q_s32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + s32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3s32_horiz_ll: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t lane0c = vld1q_s32(lanea + x - cn); + int32x4_t lane2c = vld1q_s32(lanea + x + cn); + int32x4_t lane1c = vld1q_s32(lanea + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t vc = vaddq_s32(lane0c, lane2c); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + int32x4_t wc = vaddq_s32(vc, lane1c); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + vst1q_s32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz_last2: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/canny.cpp b/3rdparty/carotene/src/canny.cpp new file mode 100644 index 0000000000..f61bc23e9b --- /dev/null +++ b/3rdparty/carotene/src/canny.cpp @@ -0,0 +1,773 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "saturate_cast.hpp"
+#include <cstring>
+#include <cstdlib>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+namespace {
+struct RowFilter3x3Canny
+{
+    inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
+    {
+        vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+        vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
+        lookLeft = offsetk - borderxl;
+        lookRight = offsetk - borderxr;
+    }
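+
+    // vfmask/vtmask feed vtbl1_u8 fixups at the row ends: when the ROI touches
+    // the image edge (borderxl/borderxr != 0) the edge pixel is duplicated into
+    // the out-of-row lane, otherwise the real pixel just outside the ROI is
+    // used. lookLeft/lookRight drop to 0 at the image edge, so the loads never
+    // step outside the allocation.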
+
+    inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
+    {
+        uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
+        ptrdiff_t i = 0;
+        for (; i < width - 8 + lookRight; i += 8)
+        {
+            internal::prefetch(src + i);
+            uint8x8_t l18u = vld1_u8(src + i + 1);
+
+            uint8x8_t l2 = l18u;
+            uint8x8_t l0 = vext_u8(l, l18u, 6);
+            int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
+
+            l = l18u;
+
+            int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
+            int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
+            int16x8_t ldy = vaddq_s16(l02, l1x2);
+
+            vst1q_s16(dstx + i, ldx);
+            vst1q_s16(dsty + i, ldy);
+        }
+
+        //tail
+        if (lookRight == 0 || i != width)
+        {
+            uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
+            uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
+            uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
+
+            int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
+            int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
+            int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
+            int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
+
+            vst1q_s16(dstx + (width - 8), taildx);
+            vst1q_s16(dsty + (width - 8), taildy);
+        }
+    }
+
+    uint8x8_t vfmask;
+    uint8x8_t vtmask;
+    enum { offsetk = 1};
+    ptrdiff_t lookLeft;
+    ptrdiff_t lookRight;
+};
+
+template <bool L2gradient>
+inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
+{
+    ptrdiff_t j = 0;
+    for (; j <= width - 8; j += 8)
+    {
+ColFilter3x3CannyL1Loop:
+        int16x8_t line0x = vld1q_s16(src0 + j);
+        int16x8_t line1x = vld1q_s16(src1 + j);
+        int16x8_t line2x = vld1q_s16(src2 + j);
+        int16x8_t line0y = vld1q_s16(src0 + j + width);
+        int16x8_t line2y = vld1q_s16(src2 + j + width);
+
+        int16x8_t l02 = vaddq_s16(line0x, line2x);
+        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
+        int16x8_t dy = vsubq_s16(line2y, line0y);
+        int16x8_t dx = vaddq_s16(l1x2, l02);
+
+        int16x8_t dya = vabsq_s16(dy);
+        int16x8_t dxa = vabsq_s16(dx);
+        int16x8_t norm = vaddq_s16(dya, dxa);
+
+        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+        int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+        vst1q_s16(dsty + j, dy);
+        vst1q_s16(dstx + j, dx);
+        vst1q_s32(mag + j + 4, normh);
+        vst1q_s32(mag + j, norml);
+    }
+    if (j != width)
+    {
+        j = width - 8;
+        goto ColFilter3x3CannyL1Loop;
+    }
+}
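+
+// Specialization for the L2 gradient: instead of |dx| + |dy| it stores
+// dx*dx + dy*dy into mag, using widening vmull_s16/vmlal_s16 so the squares
+// accumulate in 32 bits; the thresholds are squared to match (see
+// prepareThresh<true> below).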
+template <>
+inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
+{
+    ptrdiff_t j = 0;
+    for (; j <= width - 8; j += 8)
+    {
+ColFilter3x3CannyL2Loop:
+        int16x8_t line0x = vld1q_s16(src0 + j);
+        int16x8_t line1x = vld1q_s16(src1 + j);
+        int16x8_t line2x = vld1q_s16(src2 + j);
+        int16x8_t line0y = vld1q_s16(src0 + j + width);
+        int16x8_t line2y = vld1q_s16(src2 + j + width);
+
+        int16x8_t l02 = vaddq_s16(line0x, line2x);
+        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
+        int16x8_t dy = vsubq_s16(line2y, line0y);
+        int16x8_t dx = vaddq_s16(l1x2, l02);
+
+        int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
+        int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));
+
+        norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
+        normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));
+
+        vst1q_s16(dsty + j, dy);
+        vst1q_s16(dstx + j, dx);
+        vst1q_s32(mag + j, norml);
+        vst1q_s32(mag + j + 4, normh);
+    }
+    if (j != width)
+    {
+        j = width - 8;
+        goto ColFilter3x3CannyL2Loop;
+    }
+}
+
+template <bool L2gradient>
+inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
+{
+    ptrdiff_t j = 0;
+    if (colscn >= 8)
+    {
+        int16x8_t vx = vld1q_s16(_dx);
+        int16x8_t vy = vld1q_s16(_dy);
+        for (; j <= colscn - 16; j+=8)
+        {
+            internal::prefetch(_dx);
+            internal::prefetch(_dy);
+
+            int16x8_t vx2 = vld1q_s16(_dx + j + 8);
+            int16x8_t vy2 = vld1q_s16(_dy + j + 8);
+
+            int16x8_t vabsx = vabsq_s16(vx);
+            int16x8_t vabsy = vabsq_s16(vy);
+
+            int16x8_t norm = vaddq_s16(vabsx, vabsy);
+
+            int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+            int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+            vst1q_s32(_norm + j + 4, normh);
+            vst1q_s32(_norm + j + 0, norml);
+
+            vx = vx2;
+            vy = vy2;
+        }
+        int16x8_t vabsx = vabsq_s16(vx);
+        int16x8_t vabsy = vabsq_s16(vy);
+
+        int16x8_t norm = vaddq_s16(vabsx, vabsy);
+
+        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+        int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+        vst1q_s32(_norm + j + 4, normh);
+        vst1q_s32(_norm + j + 0, norml);
+    }
+    for (; j < colscn; j++)
+        _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
+}
+
+template <>
+inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
+{
+    ptrdiff_t j = 0;
+    if (colscn >= 8)
+    {
+        int16x8_t vx = vld1q_s16(_dx);
+        int16x8_t vy = vld1q_s16(_dy);
+
+        for (; j <= colscn - 16; j+=8)
+        {
+            internal::prefetch(_dx);
+            internal::prefetch(_dy);
+
+            int16x8_t vxnext = vld1q_s16(_dx + j + 8);
+            int16x8_t vynext = vld1q_s16(_dy + j + 8);
+
+            int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
+            int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
+
+            norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
+            normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
+
+            vst1q_s32(_norm + j + 0, norml);
+            vst1q_s32(_norm + j + 4, normh);
+
+            vx = vxnext;
+            vy = vynext;
+        }
+        int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
+        int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
+
+        norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
+        normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
+
+        vst1q_s32(_norm + j + 0, norml);
+        vst1q_s32(_norm + j + 4, normh);
+    }
+    for (; j < colscn; j++)
+        _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
+}
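+
+// prepareThresh turns the f64 thresholds into inclusive integer bounds for the
+// later `m > low` / `m > high` comparisons: the value is truncated (or rounded)
+// and then decremented whenever the integer still exceeds the float threshold.
+// The <true> variant below squares the thresholds first, matching the squared
+// magnitudes produced by ColFilter3x3Canny<true> and NormCanny<true>.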
+template <bool L2gradient>
+inline void prepareThresh(f64 low_thresh, f64 high_thresh,
+                          s32 &low, s32 &high)
+{
+    if (low_thresh > high_thresh)
+        std::swap(low_thresh, high_thresh);
+#if defined __GNUC__
+    low = (s32)low_thresh;
+    high = (s32)high_thresh;
+    low -= (low > low_thresh);
+    high -= (high > high_thresh);
+#else
+    low = internal::round(low_thresh);
+    high = internal::round(high_thresh);
+    f32 ldiff = (f32)(low_thresh - low);
+    f32 hdiff = (f32)(high_thresh - high);
+    low -= (ldiff < 0);
+    high -= (hdiff < 0);
+#endif
+}
+template <>
+inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
+                                s32 &low, s32 &high)
+{
+    if (low_thresh > high_thresh)
+        std::swap(low_thresh, high_thresh);
+    if (low_thresh > 0) low_thresh *= low_thresh;
+    if (high_thresh > 0) high_thresh *= high_thresh;
+#if defined __GNUC__
+    low = (s32)low_thresh;
+    high = (s32)high_thresh;
+    low -= (low > low_thresh);
+    high -= (high > high_thresh);
+#else
+    low = internal::round(low_thresh);
+    high = internal::round(high_thresh);
+    f32 ldiff = (f32)(low_thresh - low);
+    f32 hdiff = (f32)(high_thresh - high);
+    low -= (ldiff < 0);
+    high -= (hdiff < 0);
+#endif
+}
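+
+// _normEstimator supplies per-row gradients and magnitudes to Canny3x3. The
+// primary template runs the Sobel row filter itself on the u8 source (three
+// magnitude rows form a ring in mag_buf, with dx/dy/Sobel scratch packed into
+// the same allocation via the *Offset members); the specialization for
+// externally supplied Sobel data only normalizes caller-provided dx/dy planes.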
+template <bool L2gradient, bool externalSobel>
+struct _normEstimator
+{
+    ptrdiff_t magstep;
+    ptrdiff_t dxOffset;
+    ptrdiff_t dyOffset;
+    ptrdiff_t shxOffset;
+    ptrdiff_t shyOffset;
+    std::vector<u8> buffer;
+    const ptrdiff_t offsetk;
+    ptrdiff_t borderyt, borderyb;
+    RowFilter3x3Canny sobelRow;
+
+    inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
+                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
+        offsetk(1),
+        sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
+                 std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
+    {
+        mapstep = size.width + 2;
+        magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
+        dxOffset = mapstep * sizeof(s32)/sizeof(s16);
+        dyOffset = dxOffset + size.width * 1;
+        shxOffset = dxOffset + size.width * 2;
+        shyOffset = dxOffset + size.width * 3;
+        buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
+        mag_buf[0] = (s32*)&buffer[0];
+        mag_buf[1] = mag_buf[0] + magstep;
+        mag_buf[2] = mag_buf[1] + magstep;
+        memset(mag_buf[0], 0, mapstep * sizeof(s32));
+
+        map = (u8*)(mag_buf[2] + magstep);
+        memset(map, 1, mapstep);
+        memset(map + mapstep*(size.height + 1), 1, mapstep);
+        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
+        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
+    }
+    inline void firstRow(const Size2D &size, s32,
+                         const u8 *srcBase, ptrdiff_t srcStride,
+                         s16*, ptrdiff_t,
+                         s16*, ptrdiff_t,
+                         s32** mag_buf)
+    {
+        //sobelH row #0
+        const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
+        sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
+        //sobelH row #1
+        _src = internal::getRowPtr(srcBase, srcStride, 1);
+        sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
+
+        mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
+        if (borderyt == 0)
+        {
+            //sobelH row #-1
+            _src = internal::getRowPtr(srcBase, srcStride, -1);
+            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
+
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
+        }
+        else
+        {
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
+        }
+    }
+    inline void nextRow(const Size2D &size, s32,
+                        const u8 *srcBase, ptrdiff_t srcStride,
+                        s16*, ptrdiff_t,
+                        s16*, ptrdiff_t,
+                        const ptrdiff_t &mapstep, s32** mag_buf,
+                        size_t i, const s16* &_x, const s16* &_y)
+    {
+        mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
+        if (i < size.height - borderyb)
+        {
+            const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
+            //sobelH row #i+1
+            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
+
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
+                                           ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
+        }
+        else if (i < size.height)
+        {
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
+        }
+        else
+            memset(mag_buf[2], 0, mapstep*sizeof(s32));
+        _x = ((s16*)mag_buf[1]) + dxOffset;
+        _y = ((s16*)mag_buf[1]) + dyOffset;
+    }
+};
+template <bool L2gradient>
+struct _normEstimator<L2gradient, true>
+{
+    std::vector<u8> buffer;
+
+    inline _normEstimator(const Size2D &size, s32 cn, Margin,
+                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
+    {
+        mapstep = size.width + 2;
+        buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
+        mag_buf[0] = (s32*)&buffer[0];
+        mag_buf[1] = mag_buf[0] + mapstep*cn;
+        mag_buf[2] = mag_buf[1] + mapstep*cn;
+        memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));
+
+        map = (u8*)(mag_buf[2] + mapstep*cn);
+        memset(map, 1, mapstep);
+        memset(map + mapstep*(size.height + 1), 1, mapstep);
+    }
+    inline void firstRow(const Size2D &size, s32 cn,
+                         const u8 *, ptrdiff_t,
+                         s16* dxBase, ptrdiff_t dxStride,
+                         s16* dyBase, ptrdiff_t dyStride,
+                         s32** mag_buf)
+    {
+        s32* _norm = mag_buf[1] + 1;
+
+        s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
+        s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
+
+        NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
+
+        if(cn > 1)
+        {
+            for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
+            {
+                size_t maxIdx = jn;
+                for(s32 k = 1; k < cn; ++k)
+                    if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
+                _norm[j] = _norm[maxIdx];
+                _dx[j] = _dx[maxIdx];
+                _dy[j] = _dy[maxIdx];
+            }
+        }
+
+        _norm[-1] = _norm[size.width] = 0;
+    }
+    inline void nextRow(const Size2D &size, s32 cn,
+                        const u8 *, ptrdiff_t,
+                        s16* dxBase, ptrdiff_t dxStride,
+                        s16* dyBase, ptrdiff_t dyStride,
+                        const ptrdiff_t &mapstep, s32** mag_buf,
+                        size_t i, const s16* &_x, const s16* &_y)
+    {
+        s32* _norm = mag_buf[(i > 0) + 1] + 1;
+        if (i < size.height)
+        {
+            s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
+            s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
+
+            NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
+
+            if(cn > 1)
+            {
+                for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
+                {
+                    size_t maxIdx = jn;
+                    for(s32 k = 1; k < cn; ++k)
+                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
+                    _norm[j] = _norm[maxIdx];
+                    _dx[j] = _dx[maxIdx];
+                    _dy[j] = _dy[maxIdx];
+                }
+            }
+
+            _norm[-1] = _norm[size.width] = 0;
+        }
+        else
+            memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));
+
+        _x = internal::getRowPtr(dxBase, dxStride, i-1);
+        _y = internal::getRowPtr(dyBase, dyStride, i-1);
+    }
+};
+
+template <bool L2gradient, bool externalSobel>
+inline void Canny3x3(const Size2D &size, s32 cn,
+                     const u8 * srcBase, ptrdiff_t srcStride,
+                     u8 * dstBase, ptrdiff_t dstStride,
+                     s16 * dxBase, ptrdiff_t dxStride,
+                     s16 * dyBase, ptrdiff_t dyStride,
+                     f64 low_thresh, f64 high_thresh,
+                     Margin borderMargin)
+{
+    s32 low, high;
+    prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);
+
+    ptrdiff_t mapstep;
+    s32* mag_buf[3];
+    u8* map;
+    _normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);
+
+    size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
+    std::vector<u8*> stack( maxsize );
+    u8 **stack_top = &stack[0];
+    u8 **stack_bottom = &stack[0];
+
+    /* sector numbers
+       (Top-Left Origin)
+
+        1   2   3
+         *  *  *
+          * * *
+        0*******0
+          * * *
+         *  *  *
+        3   2   1
+    */
+
+    #define CANNY_PUSH(d)    *(d) = u8(2), *stack_top++ = (d)
+    #define CANNY_POP(d)     (d) = *--stack_top
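+
+    // CANNY_PUSH marks a pixel as a confirmed edge (value 2) and remembers it
+    // on the stack; the hysteresis pass below pops confirmed edges and promotes
+    // any 8-connected neighbour still marked 0 (candidate) to an edge as well.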
+
+    //i == 0
+    normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);
+    // calculate magnitude and angle of gradient, perform non-maxima suppression.
+    // fill the map with one of the following values:
+    //   0 - the pixel might belong to an edge
+    //   1 - the pixel can not belong to an edge
+    //   2 - the pixel does belong to an edge
+    for (size_t i = 1; i <= size.height; i++)
+    {
+        const s16 *_x, *_y;
+        normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);
+
+        u8* _map = map + mapstep*i + 1;
+        _map[-1] = _map[size.width] = 1;
+
+        s32* _mag = mag_buf[1] + 1; // take the central row
+        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
+        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
+
+        if ((stack_top - stack_bottom) + size.width > maxsize)
+        {
+            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
+            maxsize = maxsize * 3/2;
+            stack.resize(maxsize);
+            stack_bottom = &stack[0];
+            stack_top = stack_bottom + sz;
+        }
+
+        s32 prev_flag = 0;
+        for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
+        {
+            #define CANNY_SHIFT 15
+            const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+            s32 m = _mag[j];
+
+            if (m > low)
+            {
+                s32 xs = _x[j];
+                s32 ys = _y[j];
+                s32 x = abs(xs);
+                s32 y = abs(ys) << CANNY_SHIFT;
+
+                s32 tg22x = x * TG22;
+
+                if (y < tg22x)
+                {
+                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
+                }
+                else
+                {
+                    s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
+                    if (y > tg67x)
+                    {
+                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
+                    }
+                    else
+                    {
+                        s32 s = (xs ^ ys) < 0 ? -1 : 1;
+                        if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
+                    }
+                }
+            }
+            prev_flag = 0;
+            _map[j] = u8(1);
+            continue;
+__push:
+            if (!prev_flag && m > high && _map[j-mapstep] != 2)
+            {
+                CANNY_PUSH(_map + j);
+                prev_flag = 1;
+            }
+            else
+                _map[j] = 0;
+        }
+
+        // scroll the ring buffer
+        _mag = mag_buf[0];
+        mag_buf[0] = mag_buf[1];
+        mag_buf[1] = mag_buf[2];
+        mag_buf[2] = _mag;
+    }
+
+    // now track the edges (hysteresis thresholding)
+    while (stack_top > stack_bottom)
+    {
+        u8* m;
+        if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
+        {
+            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
+            maxsize = maxsize * 3/2;
+            stack.resize(maxsize);
+            stack_bottom = &stack[0];
+            stack_top = stack_bottom + sz;
+        }
+
+        CANNY_POP(m);
+
+        if (!m[-1])         CANNY_PUSH(m - 1);
+        if (!m[1])          CANNY_PUSH(m + 1);
+        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
+        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
+        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
+        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
+        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
+        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
+    }
+
+    // the final pass, form the final image
+    uint8x16_t v2 = vmovq_n_u8(2);
+    const u8* ptrmap = map + mapstep + 1;
+    for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
+    {
+        u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
+        ptrdiff_t j = 0;
+        for (; j < (ptrdiff_t)size.width - 16; j += 16)
+        {
+            internal::prefetch(ptrmap);
+            uint8x16_t vmap = vld1q_u8(ptrmap + j);
+            uint8x16_t vdst = vceqq_u8(vmap, v2);
+            vst1q_u8(_dst+j, vdst);
+        }
+        for (; j < (ptrdiff_t)size.width; j++)
+            _dst[j] = (u8)-(ptrmap[j] >> 1);
+    }
+}
+
+} // namespace
+#endif
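+
+// The row filter tail always rebuilds the last full vector starting at
+// src + (width - 9), so at least 9 columns are required -- hence the
+// size.width >= 9 check below.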
+bool isCanny3x3Supported(const Size2D &size)
+{
+    return isSupportedConfiguration() &&
+           size.height >= 2 && size.width >= 9;
+}
+
+void Canny3x3L1(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh,
+                Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
+#ifdef CAROTENE_NEON
+    Canny3x3<false, false>(size, 1,
+                           srcBase, srcStride,
+                           dstBase, dstStride,
+                           NULL, 0,
+                           NULL, 0,
+                           low_thresh, high_thresh,
+                           borderMargin);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)low_thresh;
+    (void)high_thresh;
+    (void)borderMargin;
+#endif
+}
+
+void Canny3x3L2(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh,
+                Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
+#ifdef CAROTENE_NEON
+    Canny3x3<true, false>(size, 1,
+                          srcBase, srcStride,
+                          dstBase, dstStride,
+                          NULL, 0,
+                          NULL, 0,
+                          low_thresh, high_thresh,
+                          borderMargin);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)low_thresh;
+    (void)high_thresh;
+    (void)borderMargin;
+#endif
+}
+
+void Canny3x3L1(const Size2D &size, s32 cn,
+                s16 * dxBase, ptrdiff_t dxStride,
+                s16 * dyBase, ptrdiff_t dyStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Canny3x3<false, true>(size, cn,
+                          NULL, 0,
+                          dstBase, dstStride,
+                          dxBase, dxStride,
+                          dyBase, dyStride,
+                          low_thresh, high_thresh,
+                          Margin());
+#else
+    (void)size;
+    (void)cn;
+    (void)dstBase;
+    (void)dstStride;
+    (void)dxBase;
+    (void)dxStride;
+    (void)dyBase;
+    (void)dyStride;
+    (void)low_thresh;
+    (void)high_thresh;
+#endif
+}
+
+void Canny3x3L2(const Size2D &size, s32 cn,
+                s16 * dxBase, ptrdiff_t dxStride,
+                s16 * dyBase, ptrdiff_t dyStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Canny3x3<true, true>(size, cn,
+                         NULL, 0,
+                         dstBase, dstStride,
+                         dxBase, dxStride,
+                         dyBase, dyStride,
+                         low_thresh, high_thresh,
+                         Margin());
+#else
+    (void)size;
+    (void)cn;
+    (void)dstBase;
+    (void)dstStride;
+    (void)dxBase;
+    (void)dxStride;
+    (void)dyBase;
+    (void)dyStride;
+    (void)low_thresh;
+    (void)high_thresh;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/channel_extract.cpp b/3rdparty/carotene/src/channel_extract.cpp
new file mode 100644
index 0000000000..fda8f6e153
--- /dev/null
+++ b/3rdparty/carotene/src/channel_extract.cpp
@@ -0,0 +1,486 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 64, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x2_t v_src = vld2q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld2q_u8(src + sj + 32); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 16, dj += 8) + { + uint8x8x2_t v_src = vld2_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 2, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 96, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x3_t v_src = vld3q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld3q_u8(src + sj + 48); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 24, dj += 8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 3, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 128, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x4_t v_src = vld4q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld4q_u8(src + sj + 64); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 32, dj += 8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 4, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i); +#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]); +#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]); +#define SST_LINE(type, n) dst##n[dj] = src[sj + n]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTDST2 srcStride == dst0Stride && \ + srcStride == dst1Stride && +#define CONTDST3 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && +#define CONTDST4 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && \ + srcStride == dst3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT_ASM2(sgn, bits) __asm__ ( \ + "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \ + "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define SPLIT_ASM3(sgn, bits) __asm__ ( \ + "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \ + "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define SPLIT_ASM4(sgn, bits) __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out3]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + SPLIT_ASM##n(sgn, bits) \ + } + +#else + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + vec128 v_src = vld##n##q_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1Q, sgn##bits) \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##bits) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \ + SPLIT_QUAD(sgn, bits, n) \ + \ + if (dj < roiw8) \ + { \ + vec64 v_src = vld##n##_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1, sgn##bits) \ + sj += MUL##n(8)/sizeof(sgn##bits); \ + dj += 8/sizeof(sgn##bits); \ + } \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + FILL_LINES##n(SST, sgn##bits) \ + } \ + } \ +} + +#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \ + const sgn##64 * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##64) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##64) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + vec64 v_src = vld##n##_##sgn##64(src + sj); \ + FILL_LINES##n(VST1, sgn##64) \ + } \ + } \ +} + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out1]] \n\t" \ + : \ + : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); \ + } + +#else + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + union { vec128_4 v4; vec128_3 v3; } vals; \ + vals.v4 = vld4q_##sgn##bits(src + sj); \ + vst3q_##sgn##bits(dst3 + d3j, vals.v3); \ + vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dst3Stride && \ + srcStride == dst1Stride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128_4; \ + typedef internal::VecTraits::vec128 vec128_3; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64_4; \ + typedef internal::VecTraits::vec64 vec64_3; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \ + sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \ + size_t sj = 0u, d3j = 0u, d1j = 0u; \ + \ + for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \ + d1j += 16/sizeof(sgn##bits)) \ + ALPHA_QUAD(sgn, bits) \ + \ + if (d1j < roiw8) \ + { \ + union { vec64_4 v4; vec64_3 v3; } vals; \ + vals.v4 = vld4_##sgn##bits(src + sj); \ + vst3_u8(dst3 + d3j, vals.v3); \ + vst1_u8(dst1 + d1j, vals.v4.val[3]); \ + sj += MUL4(8)/sizeof(sgn##bits); \ + d3j += MUL3(8)/sizeof(sgn##bits); \ + d1j += 8/sizeof(sgn##bits); \ + } \ + \ + for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \ + { \ + dst3[d3j+0] = src[sj + 0]; \ + dst3[d3j+1] = src[sj + 1]; \ + dst3[d3j+2] = src[sj + 2]; \ + dst1[d1j] = src[sj + 3]; \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride; + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + FILL_LINES##n(VOID, sgn##bits) \ +} + +#define SPLIT64(sgn,n) SPLIT(sgn,64,n) + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + (void)dst3Base; \ + (void)dst3Stride; \ + (void)dst1Base; \ + (void)dst1Stride; \ +} + +#endif //CAROTENE_NEON + +SPLIT(u, 8,2) +SPLIT(u, 8,3) +SPLIT(u, 8,4) +SPLIT(u,16,2) +SPLIT(u,16,3) +SPLIT(u,16,4) +SPLIT(s,32,2) +SPLIT(s,32,3) +SPLIT(s,32,4) + +SPLIT64(s, 2) +SPLIT64(s, 3) +SPLIT64(s, 4) + +SPLIT4ALPHA(u,8) + +} // namespace CAROTENE_NS diff --git 
a/3rdparty/carotene/src/channels_combine.cpp b/3rdparty/carotene/src/channels_combine.cpp new file mode 100644 index 0000000000..32b71470e2 --- /dev/null +++ b/3rdparty/carotene/src/channels_combine.cpp @@ -0,0 +1,389 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i); +#define PREF_LINE(type, n) internal::prefetch(src##n + sj); +#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj); +#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj); +#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj); +#define SLD_LINE(type, n) dst[dj + n] = src##n[sj]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTSRC2 dstStride == src0Stride && \ + dstStride == src1Stride && +#define CONTSRC3 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && +#define CONTSRC4 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && \ + dstStride == src3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define MERGE_ASM2(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \ + "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define MERGE_ASM3(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define MERGE_ASM4(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \ + "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \ + "vst4." 
#bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define MERGE_QUAD(sgn, bits, n) { \ + FILL_LINES##n(PREF, sgn##bits) \ + MERGE_ASM##n(sgn, bits) \ + } + +#else + +#define MERGE_QUAD(sgn, bits, n) { \ + vec128 v_dst; \ + /*FILL_LINES##n(PREF, sgn##bits) \ + FILL_LINES##n(VLD1Q, sgn##bits)*/ \ + FILL_LINES##n(PRLD, sgn##bits) \ + vst##n##q_##sgn##bits(dst + dj, v_dst); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##bits) \ + sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \ + MERGE_QUAD(sgn, bits, n) \ + \ + if ( sj < roiw8 ) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##bits) \ + vst##n##_##sgn##bits(dst + dj, v_dst); \ + sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \ + } \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + FILL_LINES##n(SLD, sgn##bits) \ + } \ + } \ +} + +#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##64), \ + sgn##64 * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##64) \ + sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##64) \ + vst##n##_##sgn##64(dst + dj, v_dst); \ + /*FILL_LINES##n(SLD, sgn##64)*/ \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride; + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + FILL_LINES##n(VOID, sgn##bits) \ + (void)dstBase; \ + (void)dstStride; \ +} +#define COMBINE64(sgn,n) COMBINE(sgn,64,n) + +#endif //CAROTENE_NEON + +COMBINE(u, 8,2) +COMBINE(u, 8,3) +COMBINE(u, 8,4) +COMBINE(u,16,2) +COMBINE(u,16,3) +COMBINE(u,16,4) +COMBINE(s,32,2) +COMBINE(s,32,3) +COMBINE(s,32,4) +COMBINE64(s, 2) +COMBINE64(s, 3) +COMBINE64(s, 4) + +void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) 
+{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; i += 1) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj); + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj + 16); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj + 16); + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1_u8(srcv + sj); + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcy[syj]; + dst[dj + 1] = srcu[sj]; + dst[dj + 2] = srcy[syj + 1]; + dst[dj + 3] = srcv[sj]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
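/* combineYUYV above interleaves one luma pair with one U and one V sample per
 * macro-pixel (4:2:2): the NEON path splits Y into even/odd lanes with
 * vld2q_u8 and packs with vst4q_u8. A scalar sketch of the same packing
 * (pack_yuyv_ref is a hypothetical helper; widthPairs counts Y pairs, i.e.
 * chroma samples, matching size.width in the function above):
 *
 *     static void pack_yuyv_ref(const u8 *y, const u8 *u, const u8 *v,
 *                               u8 *dst, size_t widthPairs)
 *     {
 *         for (size_t k = 0; k < widthPairs; ++k)
 *         {
 *             dst[4*k + 0] = y[2*k + 0];   // Y0
 *             dst[4*k + 1] = u[k];         // U
 *             dst[4*k + 2] = y[2*k + 1];   // Y1
 *             dst[4*k + 3] = v[k];         // V
 *         }
 *     }
 *
 * combineUYVY, in progress here, emits the same samples in U Y0 V Y1 order.
 */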
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = vld1q_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = vld1q_u8(srcu + sj + 16); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj + 16); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = vld1_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcu[sj]; + dst[dj + 1] = srcy[syj]; + dst[dj + 2] = srcv[sj]; + dst[dj + 3] = srcy[syj + 1]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/cmp.cpp b/3rdparty/carotene/src/cmp.cpp new file mode 100644 index 0000000000..eda121985e --- /dev/null +++ b/3rdparty/carotene/src/cmp.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
+inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
+inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
+
+template <typename Op, int elsize> struct vtail
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        //do nothing since there couldn't be enough data
+        (void)src0;
+        (void)src1;
+        (void)dst;
+        (void)op;
+        (void)x;
+        (void)width;
+    }
+};
+template <typename Op> struct vtail<Op, 2>
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        typedef typename Op::type type;
+        typedef typename internal::VecTraits<type>::vec128 vec128;
+        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+        //There are no more than 15 elements in the tail, so we can handle an 8-element vector only once
+        if( x + 8 < width)
+        {
+            vec128 v_src0, v_src1;
+            uvec128 v_dst;
+
+            v_src0 = internal::vld1q(src0 + x);
+            v_src1 = internal::vld1q(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            internal::vst1(dst + x, internal::vmovn(v_dst));
+            x+=8;
+        }
+    }
+};
+template <typename Op> struct vtail<Op, 1>
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        typedef typename Op::type type;
+        typedef typename internal::VecTraits<type>::vec128 vec128;
+        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+        typedef typename internal::VecTraits<type>::vec64 vec64;
+        typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
+        //There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once
+        if( x + 16 < width)
+        {
+            vec128 v_src0, v_src1;
+            uvec128 v_dst;
+
+            v_src0 = internal::vld1q(src0 + x);
+            v_src1 = internal::vld1q(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            internal::vst1q(dst + x, v_dst);
+            x+=16;
+        }
+        if( x + 8 < width)
+        {
+            vec64 v_src0, v_src1;
+            uvec64 v_dst;
+
+            v_src0 = internal::vld1(src0 + x);
+            v_src1 = internal::vld1(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            internal::vst1(dst + x, v_dst);
+            x+=8;
+        }
+    }
+};
+
+template <typename Op>
+void vcompare(Size2D size,
+              const typename Op::type * src0Base, ptrdiff_t src0Stride,
+              const typename Op::type * src1Base, ptrdiff_t src1Stride,
+              u8 * dstBase, ptrdiff_t dstStride, const Op & op)
+{
+    typedef typename Op::type type;
+    typedef typename internal::VecTraits<type>::vec128 vec128;
+    typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+
+    if (src0Stride == src1Stride && src0Stride == dstStride &&
+        src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
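/* Each vector comparison yields an all-ones or all-zero mask per lane; the
 * vnst overloads above narrow those masks so every input type produces u8
 * 255/0 output. A self-contained sketch for u16 lanes, using the carotene
 * u8/u16 typedefs (cmp_eq_u16x16 is a hypothetical helper, not part of the
 * API):
 *
 *     #include <arm_neon.h>
 *     // compare 16 u16 pairs, write 16 u8 results: 0xFF where equal
 *     static void cmp_eq_u16x16(const u16 *a, const u16 *b, u8 *dst)
 *     {
 *         uint16x8_t m0 = vceqq_u16(vld1q_u16(a),     vld1q_u16(b));
 *         uint16x8_t m1 = vceqq_u16(vld1q_u16(a + 8), vld1q_u16(b + 8));
 *         vst1q_u8(dst, vcombine_u8(vmovn_u16(m0), vmovn_u16(m1)));
 *     }
 */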
+    const u32 step_base = 32 / sizeof(type);
+    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
+
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
+        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
+        size_t x = 0;
+
+        for( ; x < roiw_base; x += step_base )
+        {
+            internal::prefetch(src0 + x);
+            internal::prefetch(src1 + x);
+
+            vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
+            vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
+            uvec128 v_dst0;
+            uvec128 v_dst1;
+
+            op(v_src00, v_src10, v_dst0);
+            op(v_src01, v_src11, v_dst1);
+
+            vnst(dst + x, v_dst0, v_dst1);
+        }
+
+        vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
+
+        for (; x < size.width; ++x)
+        {
+            op(src0 + x, src1 + x, dst + x);
+        }
+    }
+}
+
+template <typename T>
+struct OpCmpEQ
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
+    {
+        v_dst = internal::vceqq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
+    {
+        v_dst = internal::vceq(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, u8 * dst) const
+    {
+        dst[0] = src0[0] == src1[0] ? 255 : 0;
+    }
+};
+
+template <typename T>
+struct OpCmpNE
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
+    {
+        v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
+    {
+        v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
+    }
+
+    void operator() (const T * src0, const T * src1, u8 * dst) const
+    {
+        dst[0] = src0[0] == src1[0] ? 0 : 255;
+    }
+};
+
+template <typename T>
+struct OpCmpGT
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
+    {
+        v_dst = internal::vcgtq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
+    {
+        v_dst = internal::vcgt(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, u8 * dst) const
+    {
+        dst[0] = src0[0] > src1[0] ? 255 : 0;
+    }
+};
+
+template <typename T>
+struct OpCmpGE
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
+    {
+        v_dst = internal::vcgeq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
+    {
+        v_dst = internal::vcge(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, u8 * dst) const
+    {
+        dst[0] = src0[0] >= src1[0] ? 255 : 0;
+    }
+};
+
+}
+
+#define IMPL_CMPOP(op, type)                              \
+void cmp##op(const Size2D &size,                          \
+             const type * src0Base, ptrdiff_t src0Stride, \
+             const type * src1Base, ptrdiff_t src1Stride, \
+             u8 *dstBase, ptrdiff_t dstStride)            \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    vcompare(size,                                        \
+             src0Base, src0Stride,                        \
+             src1Base, src1Stride,                        \
+             dstBase, dstStride,                          \
+             OpCmp##op<type>());                          \
+}
+
+#else
+
+#define IMPL_CMPOP(op, type)                              \
+void cmp##op(const Size2D &size,                          \
+             const type * src0Base, ptrdiff_t src0Stride, \
+             const type * src1Base, ptrdiff_t src1Stride, \
+             u8 *dstBase, ptrdiff_t dstStride)            \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    (void)size;                                           \
+    (void)src0Base;                                       \
+    (void)src0Stride;                                     \
+    (void)src1Base;                                       \
+    (void)src1Stride;                                     \
+    (void)dstBase;                                        \
+    (void)dstStride;                                      \
+}
+
+#endif
+
+IMPL_CMPOP(EQ, u8)
+IMPL_CMPOP(EQ, s8)
+IMPL_CMPOP(EQ, u16)
+IMPL_CMPOP(EQ, s16)
+IMPL_CMPOP(EQ, u32)
+IMPL_CMPOP(EQ, s32)
+IMPL_CMPOP(EQ, f32)
+
+IMPL_CMPOP(NE, u8)
+IMPL_CMPOP(NE, s8)
+IMPL_CMPOP(NE, u16)
+IMPL_CMPOP(NE, s16)
+IMPL_CMPOP(NE, u32)
+IMPL_CMPOP(NE, s32)
+IMPL_CMPOP(NE, f32)
+
+IMPL_CMPOP(GT, u8)
+IMPL_CMPOP(GT, s8)
+IMPL_CMPOP(GT, u16)
+IMPL_CMPOP(GT, s16)
+IMPL_CMPOP(GT, u32)
+IMPL_CMPOP(GT, s32)
+IMPL_CMPOP(GT, f32)
+
+IMPL_CMPOP(GE, u8)
+IMPL_CMPOP(GE, s8)
+IMPL_CMPOP(GE, u16)
+IMPL_CMPOP(GE, s16)
+IMPL_CMPOP(GE, u32)
+IMPL_CMPOP(GE, s32)
+IMPL_CMPOP(GE, f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp
new file mode 100644
index 0000000000..ea2db6043a
--- /dev/null
+++ b/3rdparty/carotene/src/colorconvert.cpp
@@ -0,0 +1,2846 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +enum +{ + SHIFT = 14, + SHIFT_DELTA = 1 << (SHIFT - 1), + + R2Y_BT601 = 4899, + G2Y_BT601 = 9617, + B2Y_BT601 = 1868, + + R2Y_BT709 = 3483, + G2Y_BT709 = 11718, + B2Y_BT709 = 1183, +}; + +inline uint8x8_t convertToGray(const uint16x8_t & v_r, + const uint16x8_t & v_g, + const uint16x8_t & v_b, + const uint16x4_t & v_r2y, + const uint16x4_t & v_g2y, + const uint16x4_t & v_b2y) +{ + uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y); + uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y); + + uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT), + vrshrn_n_u32(v_dst1, SHIFT))); + + return v_gray; +} + +} // namespace + +#endif + +void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
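/* The weights above encode the BT.601 / BT.709 luma formulas in Q14 fixed
 * point, chosen so the three coefficients sum to exactly 1 << 14 and white
 * maps back to 255:
 *
 *     BT.601: round(0.299*16384) = 4899, round(0.587*16384) = 9617,
 *             round(0.114*16384) = 1868;  4899 + 9617 + 1868 = 16384
 *     BT.709: round(0.2126*16384) = 3483, round(0.7152*16384) = 11718,
 *             round(0.0722*16384) = 1183; 3483 + 11718 + 1183 = 16384
 *
 * A scalar sketch of the rounding step the vrshrn instructions perform
 * (gray_ref_bt601 is a hypothetical name, not part of the API):
 *
 *     static u8 gray_ref_bt601(u8 r, u8 g, u8 b)
 *     {
 *         u32 y = 4899u*r + 9617u*g + 1868u*b;   // at most 255 << 14, fits u32
 *         return (u8)((y + (1u << 13)) >> 14);   // round to nearest
 *     }
 */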
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @RGB \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @RGBA \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
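/* rgb2gray and rgbx2gray above, and the bgr variants here, share the same Q14
 * kernel; only the per-pixel source step (3 vs 4 bytes, alpha ignored) and the
 * R/B plane order differ. Typical calls, assuming a companion
 * COLOR_SPACE_BT709 enumerator declared alongside COLOR_SPACE_BT601 in the
 * public header (a sketch, not verified against functions.hpp):
 *
 *     rgb2gray (size, COLOR_SPACE_BT601, rgb,  rgbStride,  gray, grayStride);
 *     bgrx2gray(size, COLOR_SPACE_BT709, bgra, bgraStride, gray, grayStride);
 */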
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @BGR \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @BGRA \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 48) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst3.8 {d2, d4, d6}, [%[out0]] \n\t" + "vst3.8 {d3, d5, d7}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24), + [in0] "r" (src + sj) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgb1; + vRgb1.val[0] = vld1q_u8(src + sj); + + vRgb1.val[1] = vRgb1.val[0]; + vRgb1.val[2] = vRgb1.val[0]; + + vst3q_u8(dst + dj, vRgb1); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0}, [%[in]] \n\t" + "vmov.8 d1, d0 \n\t" + "vmov.8 d2, d0 \n\t" + "vst3.8 {d0-d2}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj) + : "d0","d1","d2" + ); +#else + uint8x8x3_t vRgb2; + vRgb2.val[0] = vld1_u8(src + sj); + vRgb2.val[1] = vRgb2.val[0]; + vRgb2.val[2] = vRgb2.val[0]; + + vst3_u8(dst + dj, vRgb2); +#endif + sj += 8; dj += 24; + } + + for (; sj < size.width; sj++, dj += 3) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
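/* gray2rgb above is pure replication: one vld1q_u8 load is stored through all
 * three planes of a vst3q_u8. A minimal sketch of that core (hypothetical
 * helper, full-vector rows only, no tail handling):
 *
 *     static void gray_to_rgb_row(const u8 *gray, u8 *rgb, size_t width16)
 *     {
 *         for (size_t x = 0; x < width16; x += 16)
 *         {
 *             uint8x16x3_t v;
 *             v.val[0] = v.val[1] = v.val[2] = vld1q_u8(gray + x);
 *             vst3q_u8(rgb + 3*x, v);
 *         }
 *     }
 *
 * The function below does the same with a fourth plane preset to 255.
 */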
size.width - 7 : 0; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255); +#else + uint8x16x4_t vRgba; + uint8x8x4_t vRgba2; + vRgba.val[3] = vmovq_n_u8(255); + vRgba2.val[3] = vget_low_u8(vRgba.val[3]); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 64) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t" + "vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + vRgba.val[0] = vld1q_u8(src + sj); + + vRgba.val[1] = vRgba.val[0]; + vRgba.val[2] = vRgba.val[0]; + + vst4q_u8(dst + dj, vRgba); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d5}, [%[in]] \n\t" + "vmov.8 d6, d5 \n\t" + "vmov.8 d7, d5 \n\t" + "vst4.8 {d5-d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255) + : "d5","d6","d7" + ); +#else + vRgba2.val[0] = vld1_u8(src + sj); + vRgba2.val[1] = vRgba2.val[0]; + vRgba2.val[2] = vRgba2.val[0]; + + vst4_u8(dst + dj, vRgba2); +#endif + sj += 8; dj += 32; + } + + for (; sj < size.width; sj++, dj += 4) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + dst[dj+3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255); +#else + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + v_dst0.v4.val[3] = vdupq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; + v_dst.v4.val[3] = vdup_n_u8(255); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255_0) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v3 = vld3q_u8(src + sj); + vst4q_u8(dst + dj, v_dst0.v4); + } + + if (j < roiw8) + { + v_dst.v3 = vld3_u8(src + sj); + vst4_u8(dst + dj, v_dst.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + dst[dj + 3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v4 = vld4q_u8(src + sj); + vst3q_u8(dst + dj, v_dst0.v3); + } + + if (j < roiw8) + { + v_dst.v4 = vld4_u8(src + sj); + vst3_u8(dst + dj, v_dst.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
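/* The unions in rgb2rgbx and rgbx2rgb above overlay a 3-plane register set on
 * the first three planes of a 4-plane one, so the alpha vector val[3] is set
 * to 255 once and survives every vld3q_u8 reload. A sketch of the idiom
 * (hypothetical helper, full-vector rows only):
 *
 *     static void rgb_to_rgbx_row(const u8 *rgb, u8 *rgba, size_t width16)
 *     {
 *         union { uint8x16x4_t v4; uint8x16x3_t v3; } q;
 *         q.v4.val[3] = vdupq_n_u8(255);       // alpha plane, set once
 *         for (size_t x = 0; x < width16; x += 16)
 *         {
 *             q.v3 = vld3q_u8(rgb + 3*x);      // rewrites planes 0..2 only
 *             vst4q_u8(rgba + 4*x, q.v4);      // RGB plus the preset alpha
 *         }
 *     }
 */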
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 48, j += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t vals0 = vld3q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst3q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x3_t vals = vld3_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst3_u8(dst + dj, vals); + sj += 24; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 64, j += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t vals0 = vld4q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst4q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x4_t vals = vld4_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst4_u8(dst + dj, vals); + sj += 32; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 4) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + dst[dj + 3] = src[sj + 3]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
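/* The temporaries in the scalar tails of rgb2bgr and rgbx2bgrx above
 * ("Handle src == dst case") make the R/B swap safe when source and
 * destination alias, so both support in-place use:
 *
 *     rgb2bgr  (size, buf, stride, buf, stride);   // swap R and B in place
 *     rgbx2bgrx(size, buf, stride, buf, stride);
 */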
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4 = vld4q_u8(src + sj); + std::swap(vals0.v3.val[0], vals0.v3.val[2]); + vst3q_u8(dst + dj, vals0.v3); + } + + if (j < roiw8) + { + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals; + vals.v4 = vld4_u8(src + sj); + std::swap(vals.v3.val[0], vals.v3.val[2]); + vst3_u8(dst + dj, vals.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255); +#else + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4.val[3] = vmovq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8; + vals8.v4.val[3] = vmov_n_u8(255); +#endif + +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + vals0.v3 = vld3q_u8(src + sj); + std::swap(vals0.v4.val[0], vals0.v4.val[2]); + vst4q_u8(dst + dj, vals0.v4); + } + + if (j < roiw8) + { + vals8.v3 = vld3_u8(src + sj); + std::swap(vals8.v4.val[0], vals8.v4.val[2]); + vst4_u8(dst + dj, vals8.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj + 3] = 255; + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON +inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB, + const s32 hrange ) +{ + const s32 hsv_shift = 12; + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + + uint8x8_t vMin = vmin_u8(vR, vG); + uint8x8_t vMax = vmax_u8(vR, vG); + + uint16x8_t vR_u16 = vmovl_u8(vR); + uint16x8_t vG_u16 = vmovl_u8(vG); + + vMax = vmax_u8(vMax, vB); + vMin = vmin_u8(vMin, vB); + uint16x8_t vB_u16 = vmovl_u8(vB); + + uint16x8_t vDiff = vsubl_u8(vMax, vMin); + + uint16x8_t vV = vmovl_u8(vMax); + uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff); + uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff)); + uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff)); + + uint16x8_t vVEqR = vceqq_u16(vR_u16, vV); + uint16x8_t vVEqG = vceqq_u16(vG_u16, vV); + + int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16)); + uint16x8_t vInvR = vmvnq_u16(vVEqR); + int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16)); + int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16)); + + uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR); + vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR); + int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2)); + + vVEqR = vmvnq_u16(vVEqG); + vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2)); + vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR)); + vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2); + vR_G = vaddq_s16(vR_G, vB_R); + int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR)); + + uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV)); + vR_G = vandq_s16(vR_G, vG_B); + uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV)); + int16x8_t vDiff4 = vaddq_s16(vH, vR_G); + + int32x4_t vc6 = vdupq_n_s32(v6); + uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6)); + uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6)); + + float32x4_t vF1 = vcvtq_f32_u32(vV_L); + float32x4_t vF2 = 
vcvtq_f32_u32(vV_H); + float32x4_t vHF1 = vcvtq_f32_u32(vLine1); + float32x4_t vHF2 = vcvtq_f32_u32(vLine2); + + float32x4_t vXInv1 = vrecpeq_f32(vF1); + float32x4_t vXInv2 = vrecpeq_f32(vF2); + float32x4_t vXInv3 = vrecpeq_f32(vHF1); + float32x4_t vXInv4 = vrecpeq_f32(vHF2); + + float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1); + float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2); + float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1); + float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2); + + vF1 = vmulq_f32(vXInv1, vSt1); + vF2 = vmulq_f32(vXInv2, vSt2); + vHF1 = vmulq_f32(vXInv3, vSt3); + vHF2 = vmulq_f32(vXInv4, vSt4); + + float32x4_t vDivTab = vdupq_n_f32(vsdiv_table); + vSt1 = vmulq_f32(vF1, vDivTab); + vSt2 = vmulq_f32(vF2, vDivTab); + vDivTab = vdupq_n_f32(vhdiv_table); + vSt3 = vmulq_f32(vHF1, vDivTab); + vSt4 = vmulq_f32(vHF2, vDivTab); + + float32x4_t bias = vdupq_n_f32(0.5f); + + vSt1 = vaddq_f32(vSt1, bias); + vSt2 = vaddq_f32(vSt2, bias); + vSt3 = vaddq_f32(vSt3, bias); + vSt4 = vaddq_f32(vSt4, bias); + + uint32x4_t vRes1 = vcvtq_u32_f32(vSt1); + uint32x4_t vRes2 = vcvtq_u32_f32(vSt2); + uint32x4_t vRes3 = vcvtq_u32_f32(vSt3); + uint32x4_t vRes4 = vcvtq_u32_f32(vSt4); + + int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4)); + int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4)); + + uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1); + uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2); + uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3); + uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4); + + int32x4_t vShift = vdupq_n_s32(vshift); + uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift)); + int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8); + int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8); + int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8); + int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8); + + int16x8_t vc0 = vdupq_n_s16((s16)v0); + int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4); + uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0); + int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4); + + int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16)); + int8x8_t vcHRange = vdup_n_s8((s8)vhrange); + uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange)); + int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd)); + + uint8x8x3_t vHsv; + vHsv.val[0] = vreinterpret_u8_s8(vHRes); + vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8); + vHsv.val[2] = vMax; + + return vHsv; +} + +const u8 fastSaturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + +inline void convertToHSV(const s32 r, const s32 g, const s32 b, + const s32 &hrange, const s32 &hsv_shift, + u8* dst) +{ + s32 h, s, v = b; + s32 vmin = b, diff; + s32 vr, vg; + + v += fastSaturate8u[g-v+256]; + v += fastSaturate8u[r-v+256]; + vmin -= fastSaturate8u[vmin-g+256]; + vmin -= fastSaturate8u[vmin-r+256]; + + diff = v - vmin; + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; + + s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift; + h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift; + h += h < 0 ? 
hrange : 0; + + dst[0] = internal::saturate_cast(h); + dst[1] = (u8)s; + dst[2] = (u8)v; +} + +#define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \ + __asm__ ( \ + #loadop ", [%[in]] @RGB \n\t" \ + "vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \ + "vmax.u8 d6, d0, d1 @V (d6) \n\t" \ + "vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \ + "vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \ + "vmax.u8 d6, d6, d2 \n\t" \ + "vmin.u8 d3, d3, d2 \n\t" \ + "vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \ + "vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \ + \ + "vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \ + "vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \ + "vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \ + "vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \ + "vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \ + "vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \ + \ + "vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \ + "vmvn.u16 q15, q12 @V16~R \n\t" \ + "vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \ + "vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \ + "vand.u16 q1, q13, q15 @VMask2 \n\t" \ + "vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \ + "vadd.s16 q4, q6, q10 @V16_H2 \n\t" \ + "vmvn.u16 q12, q13 @V16~G \n\t" \ + "vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \ + "vand.u16 q8, q15, q12 @VMask3 \n\t" \ + "vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \ + "vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \ + "vadd.s16 q14, q2, q15 @vH16 \n\t" \ + "vmovl.u16 q12, d10 @V32_V_L \n\t" \ + "vand.s16 q7, q7, q8 @vH16 \n\t" \ + "vmovl.u16 q13, d11 @V32_V_H \n\t" \ + "vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \ + \ + "vdup.32 q4, %[v6] \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q4 \n\t" \ + "vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \ + "vcvt.f32.u32 q8, q13 @VF2 \n\t" \ + "vcvt.f32.u32 q0, q14 @HF1 \n\t" \ + "vcvt.f32.u32 q1, q15 @HF2 \n\t" \ + "vrecpe.f32 q12, q4 @Vxinv \n\t" \ + "vrecpe.f32 q13, q8 @Vxinv \n\t" \ + "vrecpe.f32 q5, q0 @Vxinv \n\t" \ + "vrecpe.f32 q7, q1 @Vxinv \n\t" \ + "vrecps.f32 q14, q12, q4 @Vst1 \n\t" \ + "vrecps.f32 q15, q13, q8 @Vst1 \n\t" \ + "vrecps.f32 q10, q5, q0 @Vst1 \n\t" \ + "vrecps.f32 q6, q7, q1 @Vst1 \n\t" \ + "vmul.f32 q4, q12, q14 \n\t" \ + "vmul.f32 q8, q13, q15 \n\t" \ + "vmul.f32 q0, q5, q10 \n\t" \ + "vmul.f32 q1, q7, q6 \n\t" \ + "vdup.32 q12, %[vsdiv_table] \n\t" \ + "vmul.f32 q14, q4, q12 \n\t" \ + "vmul.f32 q15, q8, q12 \n\t" \ + "vdup.32 q12, %[vhdiv_table] \n\t" \ + "vmul.f32 q10, q0, q12 \n\t" \ + "vmul.f32 q6, q1, q12 \n\t" \ + \ + "vdup.32 q12, %[bias] \n\t" \ + \ + "vadd.f32 q7, q14, q12 \n\t" \ + "vadd.f32 q13, q15, q12 \n\t" \ + "vcvt.u32.f32 q4, q7 \n\t" \ + "vcvt.u32.f32 q8, q13 \n\t" \ + \ + "vadd.f32 q14, q10, q12 \n\t" \ + "vadd.f32 q7, q6, q12 \n\t" \ + "vcvt.u32.f32 q0, q14 \n\t" \ + "vcvt.u32.f32 q1, q7 @Vres \n\t" \ + \ + "vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \ + "vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q8 \n\t" \ + "vmul.u32 q10, q7, q0 \n\t" \ + "vmul.u32 q6, q5, q1 \n\t" \ + \ + "vdup.32 q12, %[vshift] \n\t" \ + "vadd.u32 q13, q14, q12 \n\t" \ + "vadd.u32 q8, q15, q12 \n\t" \ + "vadd.u32 q0, q10, q12 \n\t" \ + "vadd.u32 q1, q6, q12 \n\t" \ + "vshrn.s32 d8, q13, #8 \n\t" \ + "vshrn.s32 d9, q8, #8 \n\t" \ + "vshrn.s32 d10, q0, #8 \n\t" \ + "vshrn.s32 d11, q1, #8 \n\t" \ + \ + "vdup.16 q8, %[v0] \n\t" \ + "vshrn.s16 d5, q4, #4 \n\t" \ + "vclt.s16 q9, q5, q8 \n\t" \ + "vshrn.s16 d4, q5, #4 \n\t" \ + \ + "vmovn.s16 d9, q9 \n\t" \ + "vdup.8 d7, %[vhrange] \n\t" \ + "vand.u8 d10, d9, d7 \n\t" \ + "vadd.s8 d4, d4, d10 \n\t" \ + 
"vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + [vsdiv_table] "r" (vsdiv_table), \ + [vshift] "r" (vshift), \ + [vhdiv_table] "r" (vhdiv_table), \ + [v6] "r" (v6), [vhrange] "r" (vhrange), \ + [v0] "r" (v0), [bias] "r" (bias) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21","d22","d23", \ + "d24","d25","d26","d27","d28","d29","d30","d31" \ + ); + +#if __GNUC_MINOR__ < 7 + +#define YCRCB_CONSTS \ + register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \ + register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \ + register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \ + register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \ + register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \ + register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \ + register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427); + +#else + +#define YCRCB_CONSTS \ + const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \ + 9617, 9617, 9617, 9617, \ + 1868, 1868, 1868, 1868, \ + 6860, 6860, 6860, 6860, \ + 1332, 1332, 1332, 1332, \ + 2765, 2765, 2765, 2765, \ + 5427, 5427, 5427, 5427 }; \ + const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \ + const int16x4_t vcYB = vld1_s16(convertCoeffs + 8); /*YB*/ \ + const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \ + const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/ + +#endif + +#define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \ + __asm__ ( \ + #loadcmd ", [%[in]] @RGB \n\t" \ + "vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \ + "vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \ + "vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \ + \ + "vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \ + "vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \ + \ + "vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \ + "vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \ + \ + "vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \ + "vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \ + \ + "vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \ + "vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \ + \ + "vmov.s16 q5, #128 \n\t" \ + "vmov.s16 q6, #128 \n\t" \ + "vadd.i16 q5, q2 @Cr -> q5 \n\t" \ + "vadd.i16 q6, q3 @Cb -> q6 \n\t" \ + \ + "vqmovn.u16 d4, q4 \n\t" \ + "vqmovun.s16 d5, q5 \n\t" \ + "vqmovun.s16 d6, q6 \n\t" \ + \ + "vst3.8 {d4-d6}, [%[out]] \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + "w" (vcYR), "w" (vcYG), "w" (vcYB), \ + "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21" \ + ); + + +inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, 
const int16x8_t& vG, const int16x8_t& vB,
+                                 const int16x8_t& vcYRG, const int16x4_t& vcYB,
+                                 const int16x8_t& vcCrGB, const int16x8_t& vcCbRG )
+{
+    int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13);                  // R
+    int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13);                 // R
+    int32x4_t vYL  = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG));  // G
+    int32x4_t vYH  = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G
+    int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13);                  // B
+    int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13);                 // B
+
+    vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB));      // RG
+    vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB));     // RG
+    vYL  = vmlal_s16(vYL, vget_low_s16(vB), vcYB);                       // GB
+    vYH  = vmlal_s16(vYH, vget_high_s16(vB), vcYB);                      // GB
+    vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG));      // BR
+    vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG));     // BR
+
+    vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB));     // RGB
+    vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB));    // RGB
+    vYL  = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG));        // GBR
+    vYH  = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG));       // GBR
+    vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG));     // BRG
+    vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG));    // BRG
+
+    int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14);
+    int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14);
+    int16x4_t vYL_  = vrshrn_n_s32(vYL, 14);
+    int16x4_t vYH_  = vrshrn_n_s32(vYH, 14);
+    int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14);
+    int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14);
+
+    int16x8_t vCr = vmovq_n_s16(128);
+    int16x8_t vCb = vmovq_n_s16(128);
+
+    vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_));
+    vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_));
+
+    uint8x8x3_t vYCrCb;
+    vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_)));
+    vYCrCb.val[1] = vqmovun_s16(vCr);
+    vYCrCb.val[2] = vqmovun_s16(vCb);
+
+    return vYCrCb;
+}
+
+#define S_CONVERTTOYCRCB(R, G, B)                                             \
+    s32 Y  =        (R * 4899    + G * 9617 + B * 1868 + (1 << 13)) >> 14;    \
+    s32 Cr = 128 + ((R * 8192    - G * 6860 - B * 1332 + (1 << 13)) >> 14);   \
+    s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14);   \
+    dst[dj + 0] = internal::saturate_cast<u8>(Y);                             \
+    dst[dj + 1] = internal::saturate_cast<u8>(Cr);                            \
+    dst[dj + 2] = internal::saturate_cast<u8>(Cb);
+
+#define COEFF_Y  (   149)
+#define COEFF_BU (   129)
+#define COEFF_RV (   102)
+#define COEFF_GU (    25)
+#define COEFF_GV (    52)
+#define COEFF_R  (-14248)
+#define COEFF_G  (  8663)
+#define COEFF_B  (-17705)
+
+#if defined(__GNUC__) && __GNUC_MINOR__ < 7
+#define YUV420ALPHA3_CONST
+#define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255);
+#define YUV420ALPHA3_CONVERT
+#define YUV420ALPHA4_CONVERT , "w" (c255)
+#define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}"
+#define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}"
+#define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}"
+#define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}"
+
+#define YUV420_CONSTS(cn, bIdx, vIdx)                              \
+    register const s32 cR = s16(COEFF_R);                          \
+    register const s32 cG = s16(COEFF_G);                          \
+    register const s32 cB = s16(COEFF_B);                          \
+                                                                   \
+    register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16);         \
+    register uint8x8_t  cGU  asm ("d14") = vmov_n_u8(COEFF_GU);    \
+    register uint8x8_t  cGV  asm ("d15") = vmov_n_u8(COEFF_GV);    \
+    register uint8x8_t  cRV  asm ("d16") = vmov_n_u8(COEFF_RV);    \
+    register uint8x8_t  cBU  asm ("d17") = vmov_n_u8(COEFF_BU);    \
+    register uint8x16_t
cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \ + YUV420ALPHA##cn##_CONST + +#define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \ + __asm__ ( \ + "vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \ + "vdup.16 q4, %[cG] @cG \n\t" \ + "vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \ + "vdup.16 "#rreg", %[cR] @cR \n\t" \ + "vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \ + "vdup.16 "#breg", %[cB] @cB \n\t" \ + "vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \ + "vmax.u8 q1, q15 @max(Y,16) \n\t" \ + "vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \ + "vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \ + "vmax.u8 q2, q15 @max(Y,16) \n\t" \ + "vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \ + /*q10,q11,q12,q13 - for output*/ \ + "vmull.u8 q9, d3, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d2, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out1]] \n\t" \ + YUV420STORE2CMD##cn", [%[out1x]] \n\t" \ + \ + "vmull.u8 q9, d5, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d4, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out2]] \n\t" \ + YUV420STORE2CMD##cn", [%[out2x]] \n\t" \ + \ + : /*no output*/ \ + : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \ + [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \ + [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \ + "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \ + : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \ + "d13","d18","d19","d20","d21","d22","d23","d24","d25" \ + ); + +#else + +template +struct 
_convertYUV420Internals
+{
+    uint16x8_t vc14216;
+    uint16x8_t vc17672;
+    uint16x8_t vc8696;
+    uint8x8_t  vc102;
+    uint8x8_t  vc25;
+    uint8x8_t  vc129;
+    uint8x8_t  vc52;
+    uint16x8_t vc_1;
+    uint8x8_t  vc149;
+    uint8x8_t  vc16;
+    _convertYUV420Internals()
+    {
+        vc14216 = vdupq_n_u16(-COEFF_R);
+        vc17672 = vdupq_n_u16(-COEFF_B);
+        vc8696  = vdupq_n_u16(COEFF_G);
+        vc102   = vdup_n_u8(COEFF_RV);
+        vc25    = vdup_n_u8(COEFF_GU);
+        vc129   = vdup_n_u8(COEFF_BU);
+        vc52    = vdup_n_u8(COEFF_GV);
+        vc_1    = vdupq_n_u16((uint16_t)-1);
+        vc149   = vdup_n_u8(COEFF_Y);
+        vc16    = vdup_n_u8(16);
+    }
+
+    inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv,
+                            const u8 *y, uint8x16x3_t &rgbl )
+    {
+        //y get line
+        uint8x8x2_t yl = vld2_u8(y);
+        yl.val[0] = vmax_u8(yl.val[0], vc16);
+        yl.val[1] = vmax_u8(yl.val[1], vc16);
+
+        //y part line
+        uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y)
+        uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y)
+        int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/2
+        int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/2
+
+        //y line calc rgb
+        int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
+        int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
+        int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
+        int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
+        int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
+        int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
+
+        //y line clamp + narrow
+        uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5);
+        uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5);
+        uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5);
+        uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n);
+        uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5);
+        uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5);
+        uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n);
+        uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5);
+        uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n);
+        rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]);
+        rgbl.val[1]        = vcombine_u8(g1.val[0], g1.val[1]);
+        rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]);
+    }
+};
+
+template <int cn, int bIdx, int vIdx>
+struct _convertYUV420
+{
+    _convertYUV420Internals convertYUV420Internals;
+
+    inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
+                       u8 *dst1, u8 *dst2 )
+    {
+        uint8x8x2_t raw_uv = vld2_u8(uv);
+        uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
+        int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
+
+        int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
+        int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
+
+        uint8x16x3_t rgbl;
+        //y line1
+        convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl);
+        vst3q_u8(dst1, rgbl);
+        //y line2
+        convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl);
+        vst3q_u8(dst2, rgbl);
+    }
+};
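+
+// The cn == 4 specialization below differs from the generic template only in
+// that it pre-fills a constant opaque alpha plane (val[3] = 0xff) and stores
+// four interleaved channels with vst4q_u8 instead of three with vst3q_u8.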
+
+template <int bIdx, int vIdx>
+struct _convertYUV420<4, bIdx, vIdx>
+{
+    _convertYUV420Internals convertYUV420Internals;
+
+    inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
+                       u8 *dst1, u8 *dst2 )
+    {
+        uint8x8x2_t raw_uv = vld2_u8(uv);
+        uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
+        int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
+
+        int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
+        int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
+
+        union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl;
+        rgbl.v4.val[3] = vdupq_n_u8(0xff);
+        //y line1
+        convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3);
+        vst4q_u8(dst1, rgbl.v4);
+        //y line2
+        convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3);
+        vst4q_u8(dst2, rgbl.v4);
+    }
+};
+
+#define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420<cn, bIdx, vIdx> convertYUV420;
+
+#endif
+
+template <int cn> inline void fillAlpha(u8 *, u8 *){}
+template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2)
+{
+    dst1[3] = 255;
+    dst1[7] = 255;
+    dst2[3] = 255;
+    dst2[7] = 255;
+}
+template <int cn, int bIdx, int vIdx>
+inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2)
+{
+    int Y11 = y1[0];
+    int Y12 = y1[1];
+    int Y21 = y2[0];
+    int Y22 = y2[1];
+
+    int U = uv[1 - vIdx];
+    int V = uv[vIdx];
+
+    int y11 = (COEFF_Y * std::max(16, Y11)) >> 1;
+    int y12 = (COEFF_Y * std::max(16, Y12)) >> 1;
+    int y21 = (COEFF_Y * std::max(16, Y21)) >> 1;
+    int y22 = (COEFF_Y * std::max(16, Y22)) >> 1;
+
+    int uvR = COEFF_R + COEFF_RV * V;
+    int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V;
+    int uvB = COEFF_B + COEFF_BU * U;
+
+    dst1[2-bIdx] = internal::saturate_cast<u8>((((y11 + uvR) >> 1) + (1 << 4)) >> 5);
+    dst1[1]      = internal::saturate_cast<u8>((((y11 + uvG) >> 1) + (1 << 4)) >> 5);
+    dst1[bIdx]   = internal::saturate_cast<u8>((((y11 + uvB) >> 1) + (1 << 4)) >> 5);
+
+    dst1[cn+2-bIdx] = internal::saturate_cast<u8>((((y12 + uvR) >> 1) + (1 << 4)) >> 5);
+    dst1[cn+1]      = internal::saturate_cast<u8>((((y12 + uvG) >> 1) + (1 << 4)) >> 5);
+    dst1[cn+bIdx]   = internal::saturate_cast<u8>((((y12 + uvB) >> 1) + (1 << 4)) >> 5);
+
+    dst2[2-bIdx] = internal::saturate_cast<u8>((((y21 + uvR) >> 1) + (1 << 4)) >> 5);
+    dst2[1]      = internal::saturate_cast<u8>((((y21 + uvG) >> 1) + (1 << 4)) >> 5);
+    dst2[bIdx]   = internal::saturate_cast<u8>((((y21 + uvB) >> 1) + (1 << 4)) >> 5);
+
+    dst2[cn+2-bIdx] = internal::saturate_cast<u8>((((y22 + uvR) >> 1) + (1 << 4)) >> 5);
+    dst2[cn+1]      = internal::saturate_cast<u8>((((y22 + uvG) >> 1) + (1 << 4)) >> 5);
+    dst2[cn+bIdx]   = internal::saturate_cast<u8>((((y22 + uvB) >> 1) + (1 << 4)) >> 5);
+
+    fillAlpha<cn>(dst1, dst2);
+}
+
+// converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively
+inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB )
+{
+    uint8x16x2_t vRgb565;                               // rrrrRRRR ggggGGGG bbbbBBBB
+
+    vRgb565.val[1] = vsriq_n_u8(vB, vG, 5);             // xxxxxxxx bbbbBggg
+    vRgb565.val[0] = vshlq_n_u8(vG, 3);                 // gGGGG000 bbbbBggg
+    vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg
+
+    return vRgb565;
+}
+inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst )
+{
+    *((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8);
+}
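+// Worked example for the scalar packing above (illustrative values, not from
+// the original sources): R=200, G=100, B=50 gives
+//   (200 >> 3) | ((100 & ~3) << 3) | ((50 & ~7) << 8) = 25 | 800 | 12288 = 0x3339,
+// i.e. the first argument lands in bits 4..0, the second in bits 10..5 and the
+// third in bits 15..11 of the emitted 16-bit value.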
+#endif
+
+} //namespace
+
+void rgb2hsv(const Size2D &size,
+             const u8 * srcBase, ptrdiff_t srcStride,
+             u8 * dstBase, ptrdiff_t dstStride,
+             s32 hrange)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+    const s32 hsv_shift = 12;
+#if defined(__GNUC__) && __GNUC_MINOR__ < 7
+    register const f32 vsdiv_table = f32(255 << hsv_shift);
+    register f32 vhdiv_table = f32(hrange << hsv_shift);
+    register const s32 vhrange = hrange;
+    register const s32 v0 = s32(0);
+    register const s32 vshift = s32(1 << (hsv_shift-1));
+    register const s32 v6 = s32(6);
+    register const f32 bias = 0.5f;
+#endif
+
+    for (size_t i = 0u; i < size.height; ++i)
+    {
+        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t sj = 0u, dj = 0u, j = 0u;
+
+        for (; j < roiw8; sj += 24, dj += 24, j += 8)
+        {
+            internal::prefetch(src + sj);
+#if defined(__GNUC__) && __GNUC_MINOR__ < 7
+            CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2)
+#else
+            uint8x8x3_t vRgb = vld3_u8(src + sj);
+            uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
+            vst3_u8(dst + dj, vHsv);
+#endif
+        }
+
+        for (; j < size.width; ++j, sj += 3, dj += 3)
+        {
+            convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)hrange;
+#endif
+}
+
+void rgbx2hsv(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              s32 hrange)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+    const s32 hsv_shift = 12;
+#if defined(__GNUC__) && __GNUC_MINOR__ < 7
+    register const f32 vsdiv_table = f32(255 << hsv_shift);
+    register f32 vhdiv_table = f32(hrange << hsv_shift);
+    register const s32 vhrange = hrange;
+    register const s32 v0 = s32(0);
+    register const s32 vshift = s32(1 << (hsv_shift-1));
+    register const s32 v6 = s32(6);
+    register const f32 bias = 0.5f;
+#endif
+
+    for (size_t i = 0u; i < size.height; ++i)
+    {
+        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t sj = 0u, dj = 0u, j = 0u;
+
+        for (; j < roiw8; sj += 32, dj += 24, j += 8)
+        {
+            internal::prefetch(src + sj);
+#if defined(__GNUC__) && __GNUC_MINOR__ < 7
+            CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2)
+#else
+            uint8x8x4_t vRgb = vld4_u8(src + sj);
+            uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
+            vst3_u8(dst + dj, vHsv);
+#endif
+        }
+
+        for (; j < size.width; ++j, sj += 4, dj += 3)
+        {
+            convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)hrange;
+#endif
+}
+
+void bgr2hsv(const Size2D &size,
+             const u8 * srcBase, ptrdiff_t srcStride,
+             u8 * dstBase, ptrdiff_t dstStride,
+             s32 hrange)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0) +#else + uint8x8x4_t vRgb = vld4_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2) +#else + uint8x8x4_t vRgba = vld4_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0) +#else + uint8x8x3_t vBgr = vld3_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0) +#else + uint8x8x4_t vBgra = vld4_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + // input data: + ////////////// Y matrix: + // {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16} + // {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16} + ////////////// UV matrix: + // {v12, u12, v34, u34, v56, u56, v78, u78, v90 u90, V12, U12, V34, U34, V56, U56} + + // fp version + // R = 1.164(Y - 16) + 1.596(V - 128) + // G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) + // B = 1.164(Y - 16) + 2.018(U - 128) + + // integer version + // R = [((149*y)/2 + (-14248+102*v) )/2]/32 + // G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32 + // B = [((149*y)/2 + (-17705+129*u) )/2]/32 + + // error estimation: + //Rerr = 0.0000625 * y − 0.00225 * v − 0.287 + //Gerr = 0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.128625 + //Berr = 0.0000625 * y − 0.002375 * u - 0.287375 + + //real error test: + //================= + //R: 1 less: 520960 == 3.11% of full space + //G: 1 less: 251425 == 1.50% of full space + //B: 1 less: 455424 == 2.71% of full space + //================= + //R: 1 more: 642048 == 3.83% of full space + //G: 1 more: 192458 == 1.15% of full space + //B: 1 more: 445184 == 2.65% of full space + + 
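// worked example of the integer approximation above (illustrative numbers,
+    // recomputed from the formulas, not from the original sources):
+    //   Y = 150, V = 150:
+    //   float:   1.164*(150-16) + 1.596*(150-128) ~= 191.1
+    //   integer: (149*150)>>1 = 11175;  -14248 + 102*150 = 1052;
+    //            (((11175 + 1052) >> 1) + 16) >> 5 = 6129 >> 5 = 191
+
+    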
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 1) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 0) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 1) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.cpp b/3rdparty/carotene/src/common.cpp new file mode 100644 index 0000000000..c85b0123b6 --- /dev/null +++ b/3rdparty/carotene/src/common.cpp @@ -0,0 +1,108 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include <cstdlib> +#include <iostream> + +#include "common.hpp" + +namespace CAROTENE_NS { + +bool isSupportedConfiguration() +{ +#ifdef CAROTENE_NEON + return true; +#else + return false; +#endif +} + +namespace internal { + +void assertSupportedConfiguration(bool parametersSupported) +{ + if (!isSupportedConfiguration()) { + std::cerr << "internal error: attempted to use an unavailable function" << std::endl; + std::abort(); + } + + if (!parametersSupported) { + std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl; + std::abort(); + } +} + +ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin) +{ + ptrdiff_t p = _p + (ptrdiff_t)startMargin; + size_t len = _len + startMargin + endMargin; + if( (size_t)p < len ) + return _p; + else if( borderType == BORDER_MODE_REPLICATE ) + p = p < 0 ? 0 : (ptrdiff_t)len - 1; + else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 ) + { + s32 delta = borderType == BORDER_MODE_REFLECT101; + if( len == 1 ) + return 0; + do + { + if( p < 0 ) + p = -p - 1 + delta; + else + p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta; + } + while( (size_t)p >= len ); + } + else if( borderType == BORDER_MODE_WRAP ) + { + if( p < 0 ) + p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len; + if( p >= (ptrdiff_t)len ) + p %= (ptrdiff_t)len; + } + else if( borderType == BORDER_MODE_CONSTANT ) + p = -1; + else + internal::assertSupportedConfiguration(false); + return p - (ptrdiff_t)startMargin; +} + +} // namespace internal +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp new file mode 100644 index 0000000000..e46231a58a --- /dev/null +++ b/3rdparty/carotene/src/common.hpp @@ -0,0 +1,96 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners.
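borderInterpolate above maps an out-of-range coordinate to a source index, mirroring the OpenCV border conventions. Worked values for len = 5 (valid indices 0..4), ignoring the start/end margins:

    p = -2:  REPLICATE -> 0,  REFLECT -> 1,  REFLECT101 -> 2,  WRAP -> 3,  CONSTANT -> -1
    p =  6:  REPLICATE -> 4,  REFLECT -> 3,  REFLECT101 -> 2,  WRAP -> 1,  CONSTANT -> -1

The -1 returned for BORDER_MODE_CONSTANT is a sentinel the callers test for, so they can substitute the user-supplied border value instead of reading a source row.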
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_SRC_COMMON_HPP +#define CAROTENE_SRC_COMMON_HPP + +#include <cstddef> +#include <cstdlib> + +#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON) +#define CAROTENE_NEON +#endif + +#ifdef CAROTENE_NEON +#include <arm_neon.h> +#include "intrinsics.hpp" +#endif + +#include <carotene/functions.hpp> +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { namespace internal { + +inline void prefetch(const void *ptr, size_t offset = 32*10) +{ +#if defined __GNUC__ + __builtin_prefetch(reinterpret_cast<const char *>(ptr) + offset); +#elif defined _MSC_VER && defined CAROTENE_NEON + __prefetch(reinterpret_cast<const char *>(ptr) + offset); +#else + (void)ptr; + (void)offset; +#endif +} + +template <typename T> +inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row) +{ + char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base)); + return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride); +} + +void assertSupportedConfiguration(bool parametersSupported = true); + +ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0); + +/*! + * Aligns a pointer to the given number of bytes + * + * This small inline function aligns a pointer to the given number of bytes (which must be + * a power of two) by shifting it forward by 0 or a positive offset. + */ +template <typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T)) +{ + return (T*)(((size_t)ptr + n-1) & -n); +} + +}} + +#endif diff --git a/3rdparty/carotene/src/convert.cpp b/3rdparty/carotene/src/convert.cpp new file mode 100644 index 0000000000..2f95e29cb3 --- /dev/null +++ b/3rdparty/carotene/src/convert.cpp @@ -0,0 +1,1331 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners.
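alignPtr rounds an address up with the classic mask trick (addr + n - 1) & -n, which is why n has to be a power of two. A worked example, assuming 16-byte alignment for a q-register store:

    addr = 0x1003
    0x1003 + 15 = 0x1012
    0x1012 & ~0xF = 0x1010      (-16 == ~0xF in two's complement)

An already aligned pointer comes back unchanged: (0x1010 + 15) & ~0xF == 0x1010.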
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast<T2>(_src[i]); \ + } \ + } + +#else + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +CVT_FUNC(u8, s8, 16, + uint8x16_t v127 = vdupq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vu8 = vld1q_u8(_src + i); + int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127)); + vst1q_s8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, u16,
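The stride test at the top of CVT_FUNC is a row-collapsing optimization: when both strides equal the row width in bytes, rows are stored back to back and the whole image can be processed as one long row, so per-row loop overhead is paid once instead of size.height times. With illustrative numbers for a dense u8 image:

    size = 640 x 480, srcStride = dstStride = 640
    collapsed: size = 307200 x 1, w = 307200 & ~(16 - 1) = 307200

The w = size.width & ~(SIMD_SIZE-1) mask then splits each row into a vector body of w elements and a scalar saturate_cast tail for the remainder.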
16, + uint8x16x2_t vline; + vline.val[1] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst2q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, s32, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0); + register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0); + register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t" + "vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0), "w" (zero1), "w" (zero2) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, s32, 16, + uint8x16x4_t vline; + vline.val[1] = vmovq_n_u8(0); + vline.val[2] = vmovq_n_u8(0); + vline.val[3] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst4q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.u8 q1, d0 \n\t" + "vmovl.u8 q2, d1 \n\t" + "vmovl.u16 q3, d2 \n\t" + "vmovl.u16 q4, d3 \n\t" + "vmovl.u16 q5, d4 \n\t" + "vmovl.u16 q6, d5 \n\t" + "vcvt.f32.u32 q7, q3 \n\t" + "vcvt.f32.u32 q8, q4 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline_u8 = vld1q_u8(_src + i); + + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8)); + + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +CVT_FUNC(s8, u8, 16, + int8x16_t vZero = vdupq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vu8 = vld1q_s8(_src + i); + uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero)); + vst1q_u8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmax.s8 q0, q1 \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : 
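The vst2/vst4-with-zero-registers pattern in the widening kernels above is a store-side trick: on a little-endian target, writing each source byte interleaved with zero bytes produces exactly the u16 (or s32) little-endian representation of the zero-extended value. A scalar view of the u8 -> u16 case:

    src bytes:          0x2A 0xFF            (values 42, 255)
    vst2 output bytes:  0x2A 0x00 0xFF 0x00
    read back as u16:   0x002A, 0x00FF  ->   42, 255

This replaces an explicit vmovl widening chain with a single interleaving store, at the cost of being endian-dependent.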
/*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(s8, u16, 16, + int8x16x2_t vline_s8; + vline_s8.val[1] = vmovq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline_s8.val[0] = vld1q_s8(_src + i); + vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]); + vst2q_s8((int8_t*)(_dst + i), vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vst1.16 {d2-d3}, [%[dst1]] \n\t" + "vst1.16 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + vst1q_s16(_dst + i, vline1_s16); + vst1q_s16(_dst + i + 8, vline2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + vst1q_s32(_dst + i, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vcvt.f32.s32 q7, q3 \n\t" + "vcvt.f32.s32 q8, q4 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : 
"d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, s8, 16, + register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vmin.u8 q3, q2, q4 \n\t" + "vst1.8 {d6-d7}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0), + "w" (v127) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s8, 16, + uint8x8_t v127 = vmov_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + vline1_u8 = vmin_u8(vline1_u8, v127); + vline2_u8 = vmin_u8(vline2_u8, v127); + + vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s16, 8, + register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmin.u16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (v32767) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(u16, s16, 8, + uint16x8_t v32767 = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 
= vld1q_u16(_src + i); + vline_u16 = vminq_u16(vline_u16, v32767); + vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s32, 8, + register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vst2.16 {d0,d2}, [%[dst1]] \n\t" + "vst2.16 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (zero0) + : "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s32, 8, + uint16x8x2_t vline; + vline.val[1] = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u16(_src + i); + vst2q_u16((uint16_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.u16 q1, d0 \n\t" + "vmovl.u16 q2, d1 \n\t" + "vcvt.f32.u32 q3, q1 \n\t" + "vcvt.f32.u32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 = vld1q_u16(_src + i); + + uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16)); + uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16)); + + float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s16 d4, q0 \n\t" + "vqmovun.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16); + uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s16 d4, q0 \n\t" + "vqmovn.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + int8x8_t vline1_s8 = vqmovn_s16(vline1_s16); + int8x8_t vline2_s8 = vqmovn_s16(vline2_s16); + + vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8)); + } +}) +#endif + +#if 
__GNUC_MINOR__ < 7 +CVT_FUNC(s16, u16, 8, + register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmax.s16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vZero) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(s16, u16, 8, + int16x4_t vZero = vmov_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero); + int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero); + + vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi))); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vst1.32 {d2-d3}, [%[dst1]] \n\t" + "vst1.32 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + + vst1q_s32(_dst + i, vline_s32_lo); + vst1q_s32(_dst + i + 4, vline_s32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vcvt.f32.s32 q3, q1 \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vqmovn.u16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, 
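The s16 -> u16 path needs only a lower clamp: vmax against zero maps every negative lane to 0, and all non-negative s16 values are already representable as u16, so the reinterpret that follows is exact (e.g. -5 -> 0 and 32000 -> 32000, matching internal::saturate_cast). The s32 -> u8 path below instead narrows in two saturating steps, vqmovun_s32 into 0..65535 and vqmovn_u16 into 0..255, so a lane like 70000 becomes 65535 and then 255.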
vline2_u16)); + + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vqmovn.s16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vst1.16 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + + vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3"//,"d4","d5" + ); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4) + : "d0","d1","d2","d3"//,"d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline_s32 = vld1q_s32(_src + i); + float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i, vline_f32); + + vline_s32 = vld1q_s32(_src + i + 4); + vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i + 
4, vline_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u8, 8, + register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16)); + register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vcvt.u32.f32 q6, q4 \n\t" + "vcvt.u32.f32 q7, q5 \n\t" + "vbic q8, q1, q6 \n\t" + "vbic q9, q1, q7 \n\t" + "vshr.u32 q10, q8, #16 \n\t" + "vshr.u32 q11, q9, #16 \n\t" + "vqsub.u32 q12, q6, q10 \n\t" + "vqsub.u32 q13, q7, q11 \n\t" + "vqrshrn.u32 d28, q12, #16 \n\t" + "vqrshrn.u32 d29, q13, #16 \n\t" + "vqmovn.u16 d30, q14 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vmult), "w" (vmask) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVT_FUNC(f32, u8, 8, + float32x4_t vmult = vdupq_n_f32((float)(1 << 16)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult); + float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult); + + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32); + + uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32); + uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32); + uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16); + uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16); + uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2); + uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2); + + uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16); + uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16); + + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16)); + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s8, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vqmovn.s32 d14, q5 \n\t" + "vqmovn.s32 d15, q6 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17" + ); + } +}) +#else +CVT_FUNC(f32, s8, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vaddq_f32(vline1_f32, vhalf); + vline2_f32 = vaddq_f32(vline2_f32, vhalf); + + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, 
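The f32 -> u8 kernel above builds a fixed-point round-to-nearest out of NEON primitives. Scaling by 2^16 and narrowing with vqrshrn_n_u32(., 16) alone would round halves upward; the vbic/vshr/vqsub trio subtracts 1 beforehand whenever bit 16 (the LSB of the final result) is clear, which appears to turn exact halves into round-half-to-even. Worked lanes, assuming a non-negative input x:

    x = 2.5:  2.5 * 65536 = 0x28000, bit 16 clear -> minus 1 -> 0x27FFF
              vqrshrn #16 -> (0x27FFF + 0x8000) >> 16 = 2    (tie goes to even)
    x = 3.5:  3.5 * 65536 = 0x38000, bit 16 set -> unchanged
              vqrshrn #16 -> (0x38000 + 0x8000) >> 16 = 4

Negative inputs are handled earlier: vcvtq_u32_f32 already saturates them to 0.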
vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, u16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32); + uint16x4_t vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i, vline_u16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_u32 = vcvtq_u32_f32(vline_f32); + vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i + 4, vline_u16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, s16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + int16x4_t vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i, vline_s16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i + 4, vline_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s32, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vst1.32 {q5}, [%[dst1]] \n\t" + "vst1.32 {q6}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(f32, s32, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, 
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i, vline_s32); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i + 4, vline_s32); + } +}) +#endif + +void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride); +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convert_depth.cpp b/3rdparty/carotene/src/convert_depth.cpp new file mode 100644 index 0000000000..21b0c18a69 --- /dev/null +++ b/3rdparty/carotene/src/convert_depth.cpp @@ -0,0 +1,399 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <cstring> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <int shift> +void lshiftConst(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
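The convert(u8 -> s16) overload above simply forwards to the u8 -> u16 kernel with a cast: zero-extending a byte can never exceed 255, and the u16 bit patterns 0..255 are identical to the s16 ones, so reusing the unsigned kernel is exact. The lshiftConst/rshiftConst templates that follow take the shift amount as a template parameter so that vshlq_n_s16 and vshrq_n_s16 receive the compile-time immediate they require.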
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift)); + vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift)); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift)); + } + + for (; j < size.width; j++) + { + dst[j] = ((s16)src[j] << shift); + } + } +} + +template <> +void lshiftConst<0>(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = (s16)src[j]; + } + } +} + +template <int shift> +void rshiftConst(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
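The explicit <0> specializations are not just an optimization that skips the shift: the NEON immediate-shift intrinsics constrain their immediates (vshrq_n_s16 accepts 1..16), so a plain rshiftConst<0> instantiation of the generic template would not compile. lshiftConst<0> additionally avoids paying for a no-op vshlq_n_s16.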
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift), + v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift); + uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), + vqmovun_s16(v_src1)); + vst1q_u8(dst + j, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift); + vst1_u8(dst + j, vqmovun_s16(v_src)); + } + + for (; j < size.width; j++) + { + dst[j] = internal::saturate_cast<u8>((src[j] >> shift)); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift), + v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift); + int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), + vmovn_s16(v_src1)); + vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst)); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift); + vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src))); + } + + for (; j < size.width; j++) + { + dst[j] = (u8)((src[j] >> shift)); + } + } + } +} + +template <> +void rshiftConst<0>(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1)); + vst1q_u8(dst + j, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vld1q_s16(src + j); + vst1_u8(dst + j, vqmovun_s16(v_src)); + } + + for (; j < size.width; j++) + { + dst[j] = internal::saturate_cast<u8>(src[j]); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1)); + vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst)); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vld1q_s16(src + j); + vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src))); + } + + for (; j < size.width; j++) + { + dst[j] = (u8)src[j]; + } + } + } +} + +typedef void (* lshiftConstFunc)(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + +typedef void (* rshiftConstFunc)(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +} // namespace + +#endif + +void lshift(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + u32 shift) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if (shift >= 16u) + { + for (size_t i = 0; i < size.height; ++i) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return;
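rshiftConst treats the two overflow policies differently after the arithmetic shift. Worked values for shift = 2:

    src = -300:  -300 >> 2 = -75
        SATURATE: saturate_cast<u8>(-75) = 0      (vqmovun_s16 clamps)
        WRAP:     (u8)(-75)              = 181    (vmovn_s16 truncates)
    src = 1500:  1500 >> 2 = 375
        SATURATE: 255                             WRAP: (u8)375 = 119

so the WRAP path is a plain modulo-256 narrowing while SATURATE matches the usual saturating conversion.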
} + + // this ugly construction is needed to avoid: + // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant + // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1); + + lshiftConstFunc funcs[16] = + { + lshiftConst<0>, + lshiftConst<1>, + lshiftConst<2>, + lshiftConst<3>, + lshiftConst<4>, + lshiftConst<5>, + lshiftConst<6>, + lshiftConst<7>, + lshiftConst<8>, + lshiftConst<9>, + lshiftConst<10>, + lshiftConst<11>, + lshiftConst<12>, + lshiftConst<13>, + lshiftConst<14>, + lshiftConst<15> + }, func = funcs[shift]; + + func(size, srcBase, srcStride, dstBase, dstStride); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)shift; +#endif +} + +void rshift(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 shift, CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if (shift >= 16) + { + if (cpolicy == CONVERT_POLICY_WRAP) + { + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + int16x8_t v_zero = vdupq_n_s16(0); + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)), + vmovn_u16(vcltq_s16(v_src1, v_zero))); + vst1q_u8(dst + j, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vld1q_s16(src + j); + vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero))); + } + + for (; j < size.width; j++) + { + dst[j] = src[j] >= 0 ? 0 : 255; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memset(dst, 0, sizeof(u8) * size.width); + } + } + return; + } + + // this ugly construction is needed to avoid: + // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant + // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1); + + rshiftConstFunc funcs[16] = + { + rshiftConst<0>, + rshiftConst<1>, + rshiftConst<2>, + rshiftConst<3>, + rshiftConst<4>, + rshiftConst<5>, + rshiftConst<6>, + rshiftConst<7>, + rshiftConst<8>, + rshiftConst<9>, + rshiftConst<10>, + rshiftConst<11>, + rshiftConst<12>, + rshiftConst<13>, + rshiftConst<14>, + rshiftConst<15> + }, func = funcs[shift]; + + func(size, srcBase, srcStride, dstBase, dstStride, cpolicy); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)shift; + (void)cpolicy; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convert_scale.cpp b/3rdparty/carotene/src/convert_scale.cpp new file mode 100644 index 0000000000..50c110b3ee --- /dev/null +++ b/3rdparty/carotene/src/convert_scale.cpp @@ -0,0 +1,2498 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners.
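A minimal standalone sketch of the table-of-template-instantiations idiom that lshift and rshift use above, assuming only standard C++ (the NEON kernel is stubbed out with a plain loop, and all names here are hypothetical):

    #include <cstddef>

    // Generic kernel: the shift is a template parameter, so an intrinsic
    // that demands a compile-time immediate could be used inside.
    template <int shift>
    void shiftRow(const short * src, short * dst, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            dst[i] = static_cast<short>(src[i] << shift);
    }

    typedef void (*shiftRowFunc)(const short *, short *, size_t);

    // Runtime dispatch: index a table of instantiations with the runtime shift.
    void shiftRowDynamic(const short * src, short * dst, size_t n, unsigned shift)
    {
        static const shiftRowFunc funcs[16] =
        {
            shiftRow<0>,  shiftRow<1>,  shiftRow<2>,  shiftRow<3>,
            shiftRow<4>,  shiftRow<5>,  shiftRow<6>,  shiftRow<7>,
            shiftRow<8>,  shiftRow<9>,  shiftRow<10>, shiftRow<11>,
            shiftRow<12>, shiftRow<13>, shiftRow<14>, shiftRow<15>
        };
        if (shift < 16)
            funcs[shift](src, dst, n);
    }

The branch on the out-of-range shift plays the role of the shift >= 16 early-outs above, which must be handled before indexing the table.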
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \ + } \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T1 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T1); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + CVTSINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTSROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src +=
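CVTS_FUNC and CVTS_FUNC1 generate convertScale kernels that compute dst = saturate_cast(src * alpha + beta) per element, reusing the same row-collapse and vector/tail structure as CVT_FUNC. Worked u8 values with illustrative parameters:

    alpha = 0.5, beta = 10:
        src = 100  ->  100 * 0.5 + 10 =  60.0  ->  60
        src = 255  ->  255 * 0.5 + 10 = 137.5  -> 138
    alpha = 2.0, beta = 0:
        src = 200  ->  400  -> 255   (saturated)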
sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]*alpha + beta); \ + } \ + } + +#else + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T1 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t 
vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 
16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32))); + vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 
\n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32))); + vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] 
"r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = 
vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, 
vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), 
vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u16(_dst + i + 0, vRes1_u16); + vst1q_u16(_dst + i + 8, vRes2_u16); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] 
\n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s16(_dst + i + 0, vRes1_s16); + vst1q_s16(_dst + i + 8, vRes2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, 
q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + 
"d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + 
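+ // vqmovun_s32 narrows with unsigned saturation, clamping each scaled
+ // lane to [0, 65535] before the two halves are recombined and stored below.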
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(u16, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vshift), "w" (vscale) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(u16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t 
vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(u16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = 
vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(s16, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, 
vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + 
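+ // The s16 -> s32 widening is exact; the alpha/beta scaling runs in f32,
+ // and vcvtq_s32_f32 converts back, saturating at the s32 range limits.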
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(s16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vqmovn.u16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = 
vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vqmovn.s16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vst1.16 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = 
vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vst1.8 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vst1.32 {d20-d21}, [%[dst1]] \n\t" + "vst1.32 {d22-d23}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); + } +}) +#else +CVTS_FUNC1(s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i 
+ 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vst1.32 {d16-d17}, [%[dst1]] \n\t" + "vst1.32 {d18-d19}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(s32, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha)); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta)); + register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d6-d7}, [%[src1]] \n\t" + "vld1.32 {d8-d9}, [%[src2]] \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vcvt.u32.f32 q10, q8 \n\t" + "vbic q11, q2, q6 \n\t" + "vbic q12, q2, q7 \n\t" + "vshr.u32 q13, q11, #16 \n\t" + "vshr.u32 q14, q12, #16 \n\t" + "vqsub.u32 q7, q9, q13 \n\t" + "vqsub.u32 q8, q10, q14 \n\t" + "vqrshrn.u32 d22, q7, #16 \n\t" + "vqrshrn.u32 d23, q8, #16 \n\t" + "vqmovn.u16 d30, q11 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift), "w" (vmask) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVTS_FUNC(f32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha)); + float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i 
+ 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift); + float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32); + uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32)); + uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32)); + vline1Mask = vshrq_n_u32(vline1Mask, 16); + vline2Mask = vshrq_n_u32(vline2Mask, 16); + vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask); + vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask); + uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16); + uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d14, q8 \n\t" + "vqmovn.s32 d15, q9 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.u32.f32 q8, q6 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vqmovn.u32 d8, q8 \n\t" + "vqmovn.u32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, u16, 8, + float32x4_t vscale = 
vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovn_u32(vline1_u32); + uint16x4_t vRes2 = vqmovn_u32(vline2_u32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d8, q8 \n\t" + "vqmovn.s32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q4, q6 \n\t" + "vcvt.s32.f32 q5, q7 \n\t" + "vst1.32 {d8-d9}, [%[dst1]] \n\t" + "vst1.32 {d10-d11}, [%[dst2]] \n\t" + : //no output + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); + } +}) +#else +CVTS_FUNC(f32, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, 
vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vst1.32 {d12-d13}, [%[dst1]] \n\t" + "vst1.32 {d14-d15}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC1(f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convolution.cpp b/3rdparty/carotene/src/convolution.cpp new file mode 100644 index 0000000000..498d7ad883 --- /dev/null +++ b/3rdparty/carotene/src/convolution.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS {
+
+bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
+                            BORDER_MODE border)
+{
+    return isSupportedConfiguration() && size.width >= 8 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REPLICATE) &&
+           (ksize.width == 3) && (ksize.height == 3);
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <int shift>
+int32x4_t vshrq_s32(int32x4_t value)
+{
+    return vshrq_n_s32(value, shift);
+}
+
+template <>
+int32x4_t vshrq_s32<0>(int32x4_t value)
+{
+    return value;
+}
+
+} // namespace
+
+typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
+
+#endif
+
+void convolution(const Size2D &size,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 u8 * dstBase, ptrdiff_t dstStride,
+                 BORDER_MODE border, u8 borderValue,
+                 const Size2D & ksize, s16 * kernelBase, u32 scale)
+{
+    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
+#ifdef CAROTENE_NEON
+    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
+    const uint8x8_t v_border = vdup_n_u8(borderValue);
+    const int32x4_t v_zero_s32 = vdupq_n_s32(0);
+
+    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
+              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
+              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
+    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
+
+    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
+    static const vshrq_s32_func vshrq_s32_a[33] =
+    {
+        vshrq_s32<0>,
+        vshrq_s32<1>,
+        vshrq_s32<2>,
+        vshrq_s32<3>,
+        vshrq_s32<4>,
+        vshrq_s32<5>,
+        vshrq_s32<6>,
+        vshrq_s32<7>,
+        vshrq_s32<8>,
+        vshrq_s32<9>,
+        vshrq_s32<10>,
+        vshrq_s32<11>,
+        vshrq_s32<12>,
+        vshrq_s32<13>,
+        vshrq_s32<14>,
+        vshrq_s32<15>,
+        vshrq_s32<16>,
+        vshrq_s32<17>,
+        vshrq_s32<18>,
+        vshrq_s32<19>,
+        vshrq_s32<20>,
+        vshrq_s32<21>,
+        vshrq_s32<22>,
+        vshrq_s32<23>,
+        vshrq_s32<24>,
+        vshrq_s32<25>,
+        vshrq_s32<26>,
+        vshrq_s32<27>,
+        vshrq_s32<28>,
+        vshrq_s32<29>,
+        vshrq_s32<30>,
+        vshrq_s32<31>,
+        vshrq_s32<32>
+    };
+    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
+
+    for (ptrdiff_t y = 0; y < height; ++y)
+    {
+        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
+        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
+        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        u8 prevx[3] = { 0, 0, 0 },
+           currx[3] = { 0, 0, 0 },
+           nextx[3] = { 0, 0, 0 };
+        ptrdiff_t x = 0;
+        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
+
+        // perform vertical convolution
+        for ( ; x <= bwidth; x += 8)
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
+            uint8x8_t x1 = vld1_u8(srow1 + x);
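+            // srow0/srow2 are NULL only at the first/last row with
+            // BORDER_MODE_CONSTANT; those rows are replaced by the splatted
+            // border value, so the loop needs no per-pixel border branch
+            uint8x8_t x2 = !srow2 ? 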
v_border : vld1_u8(srow2 + x);
+
+            // calculate values for plain CPU part below if needed
+            if (x + 8 >= bwidth)
+            {
+                ptrdiff_t x3 = x == width ? width - 1 : x;
+                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
+
+                if (border == BORDER_MODE_CONSTANT && x4 < 0)
+                    prevx[0] = prevx[1] = prevx[2] = borderValue;
+                else
+                {
+                    prevx[0] = srow0 ? srow0[x4] : borderValue;
+                    prevx[1] =         srow1[x4]              ;
+                    prevx[2] = srow2 ? srow2[x4] : borderValue;
+                }
+
+                currx[0] = srow0 ? srow0[x3] : borderValue;
+                currx[1] =         srow1[x3]              ;
+                currx[2] = srow2 ? srow2[x3] : borderValue;
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev[0] = tcurr[0];
+                tcurr[0] = tnext[0];
+
+                tprev[1] = tcurr[1];
+                tcurr[1] = tnext[1];
+
+                tprev[2] = tcurr[2];
+                tcurr[2] = tnext[2];
+            }
+
+            tnext[0] = x0;
+            tnext[1] = x1;
+            tnext[2] = x2;
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
+                else if (border == BORDER_MODE_REPLICATE)
+                {
+                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
+                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
+                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
+                }
+
+                continue;
+            }
+
+            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[0], tcurr[0], 7);
+                t1 = tcurr[0];
+                t2 = vext_u8(tcurr[0], tnext[0], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
+            }
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[1], tcurr[1], 7);
+                t1 = tcurr[1];
+                t2 = vext_u8(tcurr[1], tnext[1], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
+            }
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[2], tcurr[2], 7);
+                t1 = tcurr[2];
+                t2 = vext_u8(tcurr[2], tnext[2], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
+            }
+
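+            // vext stitches the previous/current/next 8-pixel blocks into
+            // left- and right-shifted rows, so each iteration sees all nine
+            // 3x3 taps; the vshrq_s32_a table above maps the run-time scale to
+            // a compile-time shift, since vshrq_n_s32 requires an immediate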
+            // make scale
+            v_dst0 = vshrq_s32_p(v_dst0);
+            v_dst1 = vshrq_s32_p(v_dst1);
+
+            // and add them
+            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
+                                                          vqmovun_s32(v_dst1))));
+        }
+
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                {
+                    nextx[0] = borderValue;
+                    nextx[1] = borderValue;
+                    nextx[2] = borderValue;
+                }
+                else if (border == BORDER_MODE_REPLICATE)
+                {
+                    nextx[0] = srow0[x];
+                    nextx[1] = srow1[x];
+                    nextx[2] = srow2[x];
+                }
+            }
+            else
+            {
+                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
+                nextx[1] =         srow1[x + 1]              ;
+                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
+            }
+
+            s32 val = 0;
+            for (s32 _y = 0; _y < 3; ++_y)
+                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
+                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
+                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];
+
+            drow[x] = internal::saturate_cast<u8>(val >> scale);
+
+            // make shift
+            prevx[0] = currx[0];
+            currx[0] = nextx[0];
+
+            prevx[1] = currx[1];
+            currx[1] = nextx[1];
+
+            prevx[2] = currx[2];
+            currx[2] = nextx[2];
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+    (void)ksize;
+    (void)kernelBase;
+    (void)scale;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/count_nonzero.cpp b/3rdparty/carotene/src/count_nonzero.cpp
new file mode 100644
index 0000000000..be87767cbd
--- /dev/null
+++ b/3rdparty/carotene/src/count_nonzero.cpp
@@ -0,0 +1,430 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                          License Agreement
+ *               For Open Source Computer Vision Library
+ *                       (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <limits>
+
+namespace CAROTENE_NS {
+
+s32 countNonZero(const Size2D &_size,
+                 const u8 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    if (srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    size_t roiw16 = size.width & ~15u;
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
+        size_t i = 0;
+
+        #define COUNTNONZERO8U_BLOCK_SIZE (16*255)
+        uint8x16_t vc1 = vmovq_n_u8(1);
+        for (; i < roiw16;)
+        {
+            size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
+            uint8x16_t vs = vmovq_n_u8(0);
+
+            for (; i <= lim; i+= 16)
+            {
+                internal::prefetch(src + i);
+                uint8x16_t vln = vld1q_u8(src + i);
+                uint8x16_t vnz = vminq_u8(vln, vc1);
+                vs = vaddq_u8(vs, vnz);
+            }
+
+            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
+            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
+
+            s32 s[2];
+            vst1_u32((u32*)s, vs2);
+
+            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
+            {
+                return 0x7fFFffFF;
+            }
+            result += (s[0] += s[1]);
+            if (s[0] < 0 || result < 0)
+            {
+                return 0x7fFFffFF;
+            }
+        }
+        for (; i < size.width; i++)
+            result += (src[i] != 0)?1:0;
+        if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+s32 countNonZero(const Size2D &_size,
+                 const u16 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    if (srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    size_t roiw8 = size.width & ~7u;
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const u16* src = internal::getRowPtr( srcBase, srcStride, k);
+        size_t i = 0;
+
+        #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
+        uint16x8_t vc1 = vmovq_n_u16(1);
+        for (; i < roiw8;)
+        {
+            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
+            uint16x8_t vs = vmovq_n_u16(0);
+
+            for (; i <= lim; i+= 8)
+            {
+                internal::prefetch(src + i);
+                uint16x8_t vln = vld1q_u16(src + i);
+                uint16x8_t vnz = vminq_u16(vln, vc1);
+                vs = vaddq_u16(vs, vnz);
+            }
+
+            uint32x4_t vs4 = vpaddlq_u16(vs);
+            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
+
+            s32 s[2];
+            vst1_u32((u32*)s, vs2);
+
+            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
+            {
+                return 0x7fFFffFF;
+            }
+            result += (s[0] += s[1]);
+            if (s[0] < 0 || result < 0)
+            {
+                return 0x7fFFffFF;
+            }
+        }
+        for (; i < size.width; i++)
+            result += (src[i] != 0)?1:0;
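+        // as with the u8 kernel above, the block size bounds the number of 0/1
+        // increments per u16 lane so the lane counters cannot wrap before
+        // vpaddlq folds them into 32-bit sums; the s32 total saturates below
+        if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...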
+ { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k); + u32 i = 0; + + uint32x4_t vc1 = vmovq_n_u32(1); + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + uint32x4_t vln = vld1q_u32(src + i); + uint32x4_t vnz = vminq_u32(vln, vc1); + vs = vqaddq_u32(vs, vnz); + } + + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + float32x4_t vc0 = vmovq_n_f32(0); + int32x4_t vs = vmovq_n_s32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t vln = vld1q_f32(src + i); + int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0))); + vs = vqaddq_s32(vs, vnz); + } + + int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs))); + + int s[2]; + vst1_s32(s, vs2); + + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros... 
+        {
+            return 0x7fFFffFF;
+        }
+
+        for (; i < size.width; i++)
+            result += (src[i] < std::numeric_limits<f32>::min() && src[i] > -std::numeric_limits<f32>::min())?0:1;
+
+        if (result < 0)
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+s32 countNonZero(const Size2D &_size,
+                 const f64 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    if (srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    size_t roiw8 = size.width & ~7u;
+    size_t roiw4 = size.width & ~3u;
+    size_t roiw2 = size.width & ~1u;
+    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
+    uint32x4_t vc0 = vmovq_n_u32(0);
+
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const f64* src = internal::getRowPtr( srcBase, srcStride, k);
+        size_t i = 0;
+
+        int32x2_t vs1 = vmov_n_s32(0);
+        int32x2_t vs2 = vmov_n_s32(0);
+        int32x2_t vs3 = vmov_n_s32(0);
+        int32x2_t vs4 = vmov_n_s32(0);
+
+        for (; i < roiw8; i += 8 )
+        {
+            internal::prefetch(src + i + 6);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
+            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
+            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
+
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
+            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
+            uint64x2_t vm4 = vandq_u64(vln4, vmask1);
+
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
+            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
+            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
+
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
+            uint32x4_t vlx2 = vmvnq_u32(vequ2);
+            uint32x4_t vlx3 = vmvnq_u32(vequ3);
+            uint32x4_t vlx4 = vmvnq_u32(vequ4);
+
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
+            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
+            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            vs2 = vqadd_s32(vs2, vnz2);
+            vs3 = vqadd_s32(vs3, vnz3);
+            vs4 = vqadd_s32(vs4, vnz4);
+        }
+
+        if (i < roiw4)
+        {
+            internal::prefetch(src + i + 2);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
+
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
+
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
+
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
+            uint32x4_t vlx2 = vmvnq_u32(vequ2);
+
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            vs2 = vqadd_s32(vs2, vnz2);
+            i += 4;
+        }
+
+        if (i < roiw2)
+        {
+            internal::prefetch(src + i);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
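+            // a double is non-zero iff either 32-bit half of its sign-masked
+            // bits is non-zero; vpmax folds the two halves of each lane, giving
+            // -1 per non-zero double, which the saturating vqadd accumulators
+            // count (note that the scalar tails, by contrast, treat subnormals
+            // as zeros via the std::numeric_limits<f64>::min() comparison)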
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            i += 2;
+        }
+
+        vs1 = vqadd_s32(vs1, vs2);
+        vs3 = vqadd_s32(vs3, vs4);
+        vs1 = vqadd_s32(vs1, vs3);
+        int32x2_t vsneg = vqneg_s32(vs1);
+
+        s32 s[2];
+        vst1_s32(s, vsneg);
+
+        result += (s[0] += s[1]);
+        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+
+        for (; i < size.width; i++)
+            result += (src[i] < std::numeric_limits<f64>::min() && src[i] > -std::numeric_limits<f64>::min())?0:1;
+        if (result < 0)
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp
new file mode 100644
index 0000000000..9c03202a83
--- /dev/null
+++ b/3rdparty/carotene/src/div.cpp
@@ -0,0 +1,694 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                          License Agreement
+ *               For Open Source Computer Vision Library
+ *                       (3-clause BSD License)
+ *
+ * Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cmath>
+#include <cstring>
+#include <cfloat>
+#include <limits>
+
+namespace CAROTENE_NS {
+
+namespace {
+
+#ifdef CAROTENE_NEON
+
+template <typename T>
+inline T divSaturateQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
+                                                            internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
+                                                            internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
+template <>
+inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
+
+template <typename T>
+inline T divSaturate(const T &v1, const T &v2, const float scale)
+{
+    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t divSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
+template <>
+inline uint32x2_t divSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
+
+
+template <typename T>
+inline T divWrapQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
+                                                       internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
+                                                       internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t divWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
+template <>
+inline uint32x4_t divWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
+
+template <typename T>
+inline T divWrap(const T &v1, const T &v2, const float scale)
+{
+    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t divWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
+template <>
+inline uint32x2_t divWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
+
+inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
+inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
+inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
+inline int8x16_t  vtstq(const int8x16_t  & v0, const int8x16_t  & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
+inline int16x8_t  vtstq(const int16x8_t  & v0, const int16x8_t  & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
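+// vtst(q)(x, x) produces an all-ones lane exactly where x is non-zero; div and
+// recip AND their results with this mask so lanes with a zero divisor yield 0
+// instead of whatever the reciprocal estimate produced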
+inline int32x4_t  vtstq(const int32x4_t  & v0, const int32x4_t  & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
+
+inline uint8x8_t  vtst(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vtst_u8 (v0, v1); }
+inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
+inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
+inline int8x8_t   vtst(const int8x8_t   & v0, const int8x8_t   & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
+inline int16x4_t  vtst(const int16x4_t  & v0, const int16x4_t  & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
+inline int32x2_t  vtst(const int32x2_t  & v0, const int32x2_t  & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
+#endif
+
+template <typename T>
+void div(const Size2D &size,
+         const T * src0Base, ptrdiff_t src0Stride,
+         const T * src1Base, ptrdiff_t src1Stride,
+         T * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+
+    if (scale == 0.0f ||
+        (std::numeric_limits<T>::is_integer &&
+         (scale * std::numeric_limits<T>::max()) <  1.0f &&
+         (scale * std::numeric_limits<T>::max()) > -1.0f))
+    {
+        for (size_t y = 0; y < size.height; ++y)
+        {
+            T * dst = internal::getRowPtr(dstBase, dstStride, y);
+            std::memset(dst, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
+
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        T * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+
+                vec128 v_src0 = internal::vld1q(src0 + j);
+                vec128 v_src1 = internal::vld1q(src1 + j);
+
+                vec128 v_mask = vtstq(v_src1,v_src1);
+                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                vec64 v_src0 = internal::vld1(src0 + j);
+                vec64 v_src1 = internal::vld1(src1 + j);
+
+                vec64 v_mask = vtst(v_src1,v_src1);
+                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
+            }
+            for (; j < size.width; j++)
+            {
+                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+
+                vec128 v_src0 = internal::vld1q(src0 + j);
+                vec128 v_src1 = internal::vld1q(src1 + j);
+
+                vec128 v_mask = vtstq(v_src1,v_src1);
+                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                vec64 v_src0 = internal::vld1(src0 + j);
+                vec64 v_src1 = internal::vld1(src1 + j);
+
+                vec64 v_mask = vtst(v_src1,v_src1);
+                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
+            }
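+            // scalar tail for CONVERT_POLICY_WRAP: trunc() mirrors the
+            // truncation of the f32 -> integer vector conversion, and the
+            // plain cast to T wraps instead of saturating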
+            for (; j < size.width; j++)
+            {
+                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)cpolicy;
+    (void)scale;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+template <typename T>
+inline T recipSaturateQ(const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t recipSaturateQ(const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t recipSaturateQ(const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T recipSaturate(const T &v2, const float scale)
+{
+    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t recipSaturate(const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t recipSaturate(const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
+
+
+template <typename T>
+inline T recipWrapQ(const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t recipWrapQ(const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t recipWrapQ(const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T recipWrap(const T &v2, const float scale)
+{
+    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t recipWrap(const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t recipWrap(const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
+#endif
+
+template <typename T>
+void recip(const Size2D &size,
+           const T * src1Base, ptrdiff_t src1Stride,
+           T * dstBase, ptrdiff_t dstStride,
+           f32 scale,
+           CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+
+    if (scale == 0.0f ||
+        (std::numeric_limits<T>::is_integer &&
+         scale <  1.0f &&
+         scale > -1.0f))
+    {
+        for (size_t y = 0; y < size.height; ++y)
+        {
+            T * dst = internal::getRowPtr(dstBase, dstStride, y);
+            std::memset(dst, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
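+    // for an integer T, |scale| < 1 makes scale / src1[j] truncate to 0 for
+    // every non-zero divisor, hence the early zero-fill above; the loops below
+    // mirror div(): scale times a refined vrecp reciprocal estimate, masked
+    // with vtst so zero inputs produce 0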
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        T * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src1 + j);
+
+                vec128 v_src1 = internal::vld1q(src1 + j);
+
+                vec128 v_mask = vtstq(v_src1,v_src1);
+                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                vec64 v_src1 = internal::vld1(src1 + j);
+
+                vec64 v_mask = vtst(v_src1,v_src1);
+                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
+            }
+            for (; j < size.width; j++)
+            {
+                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src1 + j);
+
+                vec128 v_src1 = internal::vld1q(src1 + j);
+
+                vec128 v_mask = vtstq(v_src1,v_src1);
+                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                vec64 v_src1 = internal::vld1(src1 + j);
+
+                vec64 v_mask = vtst(v_src1,v_src1);
+                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
+            }
+            for (; j < size.width; j++)
+            {
+                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)cpolicy;
+    (void)scale;
+#endif
+}
+
+}
+
+void div(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
+}
+
+void div(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
+}
+
+void div(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
+}
+
+void div(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
+}
+
+void div(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
+}
+
+void div(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 * dstBase, ptrdiff_t dstStride,
+         f32 scale)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (scale == 0.0f)
+    {
+        for (size_t y = 0; y < size.height; ++y)
+        {
+            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
+            std::memset(dst, 0, sizeof(f32) * size.width);
+        }
+        return;
+    }
+
+    float32x4_t v_zero = vdupq_n_f32(0.0f);
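+    // f32 path: division is replaced by multiplication with the refined
+    // reciprocal from internal::vrecpq_f32; vceq/vbic clear lanes whose
+    // divisor is exactly 0.0f, so x / 0 yields 0 by convention
+    size_t roiw128 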
 + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale), + internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale), + internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ?
src0[j] * scale / src1[j] : 0.0f; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + float32x4_t v_zero = vdupq_n_f32(0.0f); + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ?
1.0f / src1[j] : 0; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1), + scale)),v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1), + scale)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? scale / src1[j] : 0; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/dot_product.cpp b/3rdparty/carotene/src/dot_product.cpp new file mode 100644 index 0000000000..1759ea7cd5 --- /dev/null +++ b/3rdparty/carotene/src/dot_product.cpp @@ -0,0 +1,260 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +f64 dotProduct(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow +// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements +#define DOT_UINT_BLOCKSIZE 66050*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + uint64x2_t ws = vmovq_n_u64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16; + + uint32x4_t s1 = vmovq_n_u32(0); + uint32x4_t s2 = vmovq_n_u32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + uint8x16_t vs1 = vld1q_u8(src0 + i); + uint8x16_t vs2 = vld1q_u8(src1 + i); + + uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + s1 = vpadalq_u16(s1, vdot1); + s2 = vpadalq_u16(s2, vdot2); + } + + ws = vpadalq_u32(ws, s1); + ws = vpadalq_u32(ws, s2); + } + + if(i + 8 <= size.width) + { + uint8x8_t vs1 = vld1_u8(src0 + i); + uint8x8_t vs2 = vld1_u8(src1 + i); + + ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow +// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
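+// (Worked check: the largest s8 product magnitude is (-128)*(-128) = 16384, and +// floor((2^31 - 1) / 16384) = 131071, so 131071 products fit in an s32 accumulator; +// each 16-element step adds two products into every s32 lane, which gives the +// 131071/2 * 16 = 131070*8 block bound defined below.)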
+#define DOT_INT_BLOCKSIZE 131070*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + int64x2_t ws = vmovq_n_s64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16; + + int32x4_t s1 = vmovq_n_s32(0); + int32x4_t s2 = vmovq_n_s32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + int8x16_t vs1 = vld1q_s8(src0 + i); + int8x16_t vs2 = vld1q_s8(src1 + i); + + int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2)); + int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2)); + + s1 = vpadalq_s16(s1, vdot1); + s2 = vpadalq_s16(s2, vdot2); + } + + ws = vpadalq_s32(ws, s1); + ws = vpadalq_s32(ws, s2); + } + + if(i + 8 <= size.width) + { + int8x8_t vs1 = vld1_s8(src0 + i); + int8x8_t vs2 = vld1_s8(src1 + i); + + ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) + { + size.width *= size.height; + size.height = 1; + } + +#define DOT_FLOAT_BLOCKSIZE (1 << 13) + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + while(i + 4 <= size.width) + { + size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; + float32x4_t v_sum = vdupq_n_f32(0.0f); + + for( ; i <= lim; i += 4 ) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); + } + + float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + } + + if(i + 2 <= size.width) + { + float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + i += 2; + } + + for (; i < size.width; ++i) + result += src0[i] * src1[i]; + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fast.cpp b/3rdparty/carotene/src/fast.cpp new file mode 100644 index 0000000000..9506c1b6be --- /dev/null +++ b/3rdparty/carotene/src/fast.cpp @@ -0,0 +1,428 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + +/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten. + Below is the original copyright and the references */ + +/* +Copyright (c) 2006, 2008 Edward Rosten +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +The references are: + * Machine learning for high-speed corner detection, + E. Rosten and T. Drummond, ECCV 2006 + * Faster and better: A machine learning approach to corner detection + E. Rosten, R. Porter and T. 
Drummond, PAMI, 2009 +*/ + +#include "common.hpp" + +#include <vector> +#include <cstring> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace +{ + +void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride) +{ + pixel[0] = 0 + row_stride * 3; + pixel[1] = 1 + row_stride * 3; + pixel[2] = 2 + row_stride * 2; + pixel[3] = 3 + row_stride * 1; + pixel[4] = 3 + row_stride * 0; + pixel[5] = 3 + row_stride * -1; + pixel[6] = 2 + row_stride * -2; + pixel[7] = 1 + row_stride * -3; + pixel[8] = 0 + row_stride * -3; + pixel[9] = -1 + row_stride * -3; + pixel[10] = -2 + row_stride * -2; + pixel[11] = -3 + row_stride * -1; + pixel[12] = -3 + row_stride * 0; + pixel[13] = -3 + row_stride * 1; + pixel[14] = -2 + row_stride * 2; + pixel[15] = -1 + row_stride * 3; +} + +u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[]) +{ + const s32 K = 8, N = 16 + K + 1; + s32 k, v = ptr[0]; + s16 d[(N + 7) & ~7]; + for( k = 0; k < N; k++ ) + d[k] = (s16)(v - ptr[pixel[k]]); + + int16x8_t q0 = vdupq_n_s16((s16)(-1000)); + int16x8_t q1 = vdupq_n_s16((s16)(1000)); + + int16x8_t d0_7 = vld1q_s16(d + 0); + int16x8_t d8_15 = vld1q_s16(d + 8); + int16x8_t d16_23 = vld1q_s16(d + 16); + int16x8_t d24 = vld1q_s16(d + 24); + + //k == 0 + int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1); + int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2); + int16x8_t ak0 = vminq_s16(v0k0, v1k0); + int16x8_t bk0 = vmaxq_s16(v0k0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 3); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 4); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 5); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 6); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 7); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + ak0 = vminq_s16(ak0, d8_15); + bk0 = vmaxq_s16(bk0, d8_15); + + q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7)); + + v1k0 = vextq_s16(d8_15, d16_23, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0)); + + //k == 8 + int16x8_t v0k8 = v1k0; + int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2); + int16x8_t ak8 = vminq_s16(v0k8, v1k8); + int16x8_t bk8 = vmaxq_s16(v0k8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 3); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 4); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 5); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 6); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 7); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + ak8 = vminq_s16(ak8, d16_23); + bk8 = vmaxq_s16(bk8, d16_23); + + q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15)); + + v1k8 = vextq_s16(d16_23, d24, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8)); + + //fin + int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1)); + int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q)); + int32x4_t q2w = vmovl_s16(q2); + int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w)); + int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32))); + + return (u8)(vget_lane_s32(q8, 0) - 1); +} + +} //namespace +#endif
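 + +// FAST segment test: a pixel p is a corner when at least K+1 = 9 contiguous of the +// 16 Bresenham-circle pixels are all darker than p - threshold or all brighter than +// p + threshold. In the scalar path below, threshold_tab[x - p + 255] pre-classifies +// a circle pixel x as 1 (darker), 2 (brighter) or 0, so most candidates are rejected +// with a few lookups on opposite pixel pairs before the full contiguous-arc scan.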
 + +void FAST(const Size2D &size, + u8 *srcBase, ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + //keypoints.clear(); + + const s32 K = 8, N = 16 + K + 1; + ptrdiff_t i, j, k, pixel[N]; + makeOffsets(pixel, srcStride); + for(k = 16; k < N; k++) + pixel[k] = pixel[k - 16]; + + uint8x16_t delta = vdupq_n_u8(128); + uint8x16_t t = vdupq_n_u8(threshold); + uint8x16_t K16 = vdupq_n_u8((u8)K); + + u8 threshold_tab[512]; + for( i = -255; i <= 255; i++ ) + threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0); + + std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128); + u8* buf[3]; + buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width; + ptrdiff_t* cpbuf[3]; + cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1; + cpbuf[1] = cpbuf[0] + size.width + 1; + cpbuf[2] = cpbuf[1] + size.width + 1; + memset(buf[0], 0, size.width*3); + + for(i = 3; i < (ptrdiff_t)size.height-2; i++) + { + const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3; + u8* curr = buf[(i - 3)%3]; + ptrdiff_t* cornerpos = cpbuf[(i - 3)%3]; + memset(curr, 0, size.width); + ptrdiff_t ncorners = 0; + + if( i < (ptrdiff_t)size.height - 3 ) + { + j = 3; + + for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16) + { + internal::prefetch(ptr); + internal::prefetch(ptr + pixel[0]); + internal::prefetch(ptr + pixel[2]); + + uint8x16_t v0 = vld1q_u8(ptr); + int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta)); + int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta)); + + int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta)); + int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta)); + int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta)); + int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta)); + + uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2)); + uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1)); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0))); + m0 = vorrq_u8(m0, m1); + + u64 mask[2]; + vst1q_u64(mask, vreinterpretq_u64_u8(m0)); + + if( mask[0] == 0 ) + { + if (mask[1] != 0) + { + j -= 8; + ptr -= 8; + } + continue; + } + + uint8x16_t c0 = vmovq_n_u8(0); + uint8x16_t c1 = vmovq_n_u8(0); + uint8x16_t max0 = vmovq_n_u8(0); + uint8x16_t max1 = vmovq_n_u8(0); + for( k = 0; k < N; k++ ) + { + int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta)); + m0 = vcgtq_s8(x, v2); + m1 = vcgtq_s8(v1, x); + + c0 = vandq_u8(vsubq_u8(c0, m0), m0); + c1 = vandq_u8(vsubq_u8(c1, m1), m1); + + max0 = vmaxq_u8(max0, c0); + max1 = vmaxq_u8(max1, c1); + } + + max0 = vmaxq_u8(max0, max1); + u8 m[16]; + vst1q_u8(m, vcgtq_u8(max0, K16)); + + for( k = 0; k < 16; ++k ) + if(m[k]) + { + cornerpos[ncorners++] = j+k; + if(nonmax_suppression) + curr[j+k] = cornerScore(ptr+k, pixel); + } + } + + for( ; j < (s32)size.width - 3; j++, ptr++ ) + { + s32 v = ptr[0]; + const u8* tab = &threshold_tab[0] - v + 255; + s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; + + if( d == 0 ) +
continue; + + d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; + d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; + d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; + + if( d == 0 ) + continue; + + d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; + d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; + d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; + d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; + + if( d & 1 ) + { + s32 vt = v - threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x < vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + + if( d & 2 ) + { + s32 vt = v + threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x > vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + } + } + + cornerpos[-1] = ncorners; + + if( i == 3 ) + continue; + + const u8* prev = buf[(i - 4 + 3)%3]; + const u8* pprev = buf[(i - 5 + 3)%3]; + cornerpos = cpbuf[(i - 4 + 3)%3]; + ncorners = cornerpos[-1]; + + for( k = 0; k < ncorners; k++ ) + { + j = cornerpos[k]; + s32 score = prev[j]; + if( !nonmax_suppression || + (score > prev[j+1] && score > prev[j-1] && + score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && + score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) + { + keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score); + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)keypoints; + (void)threshold; + (void)nonmax_suppression; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fill_minmaxloc.cpp b/3rdparty/carotene/src/fill_minmaxloc.cpp new file mode 100644 index 0000000000..fdf0e35d03 --- /dev/null +++ b/3rdparty/carotene/src/fill_minmaxloc.cpp @@ -0,0 +1,442 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +void process(const T * src, size_t j0, size_t j1, size_t i, + T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + for (size_t j = j0; j < j1; ++j) + { + T val = src[j]; + + if (val == maxVal) + { + if (maxLocCount < maxLocCapacity) + { + maxLocPtr[maxLocCount] = j; + maxLocPtr[maxLocCount + 1] = i; + } + maxLocCount += 2; + } + + if (val == minVal) + { + if (minLocCount < minLocCapacity) + { + minLocPtr[minLocCount] = j; + minLocPtr[minLocCount + 1] = i; + } + minLocCount += 2; + } + } +} + +} // namespace + +#endif + +void fillMinMaxLocs(const Size2D & size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal); + uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal); + + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + + uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16); + uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16); + uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask); + + vst1q_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + + uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8); + uint8x8_t v_minmask = vceq_u8(v_src, v_minval8); + uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +}
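 + +// Each recorded location occupies two size_t slots (x, then y), so the counts and +// capacities are doubled on entry and halved again before returning; process() keeps +// incrementing the count past the capacity, letting callers detect how many extrema +// did not fit in the supplied buffers.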
 + +void fillMinMaxLocs(const Size2D & size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_maxval8 = vdupq_n_u16(maxVal), + v_minval8 = vdupq_n_u16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint16x8_t v_src = vld1q_u16(src + j); + + uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + int16x8_t v_maxval8 = vdupq_n_s16(maxVal), + v_minval8 = vdupq_n_s16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int16x8_t v_src = vld1q_s16(src + j); + + uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + int32x4_t v_maxval4 = vdupq_n_s32(maxVal), + v_minval4 = vdupq_n_s32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint32x4_t v_maxval4 = vdupq_n_u32(maxVal), + v_minval4 = vdupq_n_u32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/flip.cpp b/3rdparty/carotene/src/flip.cpp new file mode 100644 index 0000000000..339398dd92 --- /dev/null +++ b/3rdparty/carotene/src/flip.cpp @@ -0,0 +1,222 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include <cstring> + +namespace CAROTENE_NS { + +bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize) +{ + bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4); + return isSupportedConfiguration() && + ((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) || + (flipMode == FLIP_VERTICAL_MODE)); +} + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +void flip(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + + typedef typename VecTraits<T>::vec128 vec128; + typedef typename VecTraits<T>::vec64 vec64; + + u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
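 + + // Horizontal reversal: vrev64q reverses the elements within each 64-bit half of + // the register, and vcombine(vget_high, vget_low) swaps the halves, completing the + // 128-bit reversal; jd then walks the destination row from right to left.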
 + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i); + size_t js = 0, jd = size.width; + + for (; js < roiw_base; js += step_base, jd -= step_base) + { + prefetch(src + js); + + vec128 v_src = vld1q(src + js); + vec128 v_dst = vrev64q(v_src); + v_dst = vcombine(vget_high(v_dst), vget_low(v_dst)); + vst1q(dst + jd - step_base, v_dst); + } + for (; js < roiw_tail; js += step_tail, jd -= step_tail) + { + vec64 v_src = vld1(src + js); + vst1(dst + jd - step_tail, vrev64(v_src)); + } + + for (--jd; js < size.width; ++js, --jd) + dst[jd] = src[js]; + } +} + +template <typename T> +void flip3(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + +#ifndef ANDROID + typedef typename VecTraits<T, 3>::vec128 vec128; +#endif + typedef typename VecTraits<T, 3>::vec64 vec64; + +#ifndef ANDROID + u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3; + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; +#endif + u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i); + size_t j = 0, js = 0, jd = size.width * 3; + +#ifndef ANDROID + for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3) + { + prefetch(src + js); + + vec128 v_src = vld3q(src + js), v_dst; + v_src.val[0] = vrev64q(v_src.val[0]); + v_src.val[1] = vrev64q(v_src.val[1]); + v_src.val[2] = vrev64q(v_src.val[2]); + + v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0])); + v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1])); + v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2])); + + vst3q(dst + jd - step_base3, v_dst); + } +#endif // ANDROID + + for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3) + { + vec64 v_src = vld3(src + js), v_dst; + v_dst.val[0] = vrev64(v_src.val[0]); + v_dst.val[1] = vrev64(v_src.val[1]); + v_dst.val[2] = vrev64(v_src.val[2]); + + vst3(dst + jd - step_tail3, v_dst); + } + + for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3) + { + dst[jd] = src[js]; + dst[jd + 1] = src[js + 1]; + dst[jd + 2] = src[js + 2]; + } + } +} + +typedef void (* flipFunc)(const Size2D &size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode); + +} // namespace + +#endif + +void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize) +{ + internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize)); +#ifdef CAROTENE_NEON + + if (flipMode == FLIP_VERTICAL_MODE) + { + for (size_t y = 0; y < size.height; ++y) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1); + + std::memcpy(dst_row, src_row, elemSize * size.width); + } + return; + } + + flipFunc func = NULL; + + if (elemSize == (u32)sizeof(u8)) + func = &flip<u8>; + if (elemSize == (u32)sizeof(u16)) + func = &flip<u16>; + if (elemSize == (u32)sizeof(u32)) + func = &flip<u32>; + if (elemSize == (u32)sizeof(u8) * 3) + func = &flip3<u8>; + + if (func == NULL) + return; + + func(size, + srcBase, srcStride, + dstBase, dstStride, + flipMode); + +#else + (void)size; +
(void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)flipMode; + (void)elemSize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/gaussian_blur.cpp b/3rdparty/carotene/src/gaussian_blur.cpp new file mode 100644 index 0000000000..069373e419 --- /dev/null +++ b/3rdparty/carotene/src/gaussian_blur.cpp @@ -0,0 +1,1059 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include "separable_filter.hpp" + +namespace CAROTENE_NS { + +bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? 
NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1)); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x4; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2)); + vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4)); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue << 2; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + (srow1[x] << 1) + srow0[x]; + } + else + nextx = (srow2 ? srow2[x + 1] : borderValue) + + (srow1[x + 1] << 1) + + (srow0 ? 
srow0[x + 1] : borderValue); + + s32 val = (prevx + (currx << 1) + nextx) >> 4; + drow[x] = internal::saturate_cast<u8>(val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin) +{ + return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin); +} + +void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin)); +#ifdef CAROTENE_NEON + internal::sepFilter3x3::process( + size, srcBase, srcStride, dstBase, dstStride, + 0, 0, border, borderValue, borderMargin); +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width >= 8 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<u8> _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < size.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top
? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(lane + x); + + uint16x8_t lane0 = vld1q_u16(lane + x - 2); + uint16x8_t lane4 = vld1q_u16(lane + x + 2); + uint16x8_t lane1 = vld1q_u16(lane + x - 1); + uint16x8_t lane3 = vld1q_u16(lane + x + 1); + uint16x8_t lane2 = vld1q_u16(lane + x + 0); + + uint16x8_t ln04 = vaddq_u16(lane0, lane4); + uint16x8_t ln13 = vaddq_u16(lane1, lane3); + + uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16); + uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16); + + uint8x8_t ls = vrshrn_n_u16(lsw, 8); + + vst1_u8(dst + x, ls); + } + break; + case 2: + for (; x <= colsn - 8*2; x += 8*2) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*2; + u16* lidx1 = lane + x - 1*2; + u16* lidx3 = lane + x + 1*2; + u16* lidx4 = lane + x + 2*2; +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.16 {d0, d2}, [%[in0]]! \n\t" + "vld2.16 {d1, d3}, [%[in0]] \n\t" + "vld2.16 {d8, d10}, [%[in4]]! \n\t" + "vld2.16 {d9, d11}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vld2.16 {d16, d18}, [%[in1]]! \n\t" + "vld2.16 {d17, d19}, [%[in1]] \n\t" + "vld2.16 {d8, d10}, [%[in3]]! 
\n\t" + "vld2.16 {d9, d11}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vld2.16 {d16, d18}, [%[in2]] \n\t" + "vld2.16 {d17, d19}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lidx0); + uint16x8x2_t vLane1 = vld2q_u16(lidx1); + uint16x8x2_t vLane2 = vld2q_u16(lane + x); + uint16x8x2_t vLane3 = vld2q_u16(lidx3); + uint16x8x2_t vLane4 = vld2q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vst2_u8(dst + x, vRes); +#endif + } + break; + case 3: + for (; x <= colsn - 8*3; x += 8*3) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*3; + u16* lidx1 = lane + x - 1*3; + u16* lidx3 = lane + x + 1*3; + u16* lidx4 = lane + x + 2*3; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld3.16 {d0, d2, d4}, [%[in0]]! \n\t" + "vld3.16 {d1, d3, d5}, [%[in0]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in4]]! \n\t" + "vld3.16 {d9, d11, d13}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vld3.16 {d16, d18, d20}, [%[in1]]! \n\t" + "vld3.16 {d17, d19, d21}, [%[in1]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in3]]! 
\n\t" + "vld3.16 {d9, d11, d13}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vld3.16 {d16, d18, d20}, [%[in2]] \n\t" + "vld3.16 {d17, d19, d21}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vst3.8 {d8-d10}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*3), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x3_t vLane0 = vld3q_u16(lidx0); + uint16x8x3_t vLane1 = vld3q_u16(lidx1); + uint16x8x3_t vLane2 = vld3q_u16(lane + x); + uint16x8x3_t vLane3 = vld3q_u16(lidx3); + uint16x8x3_t vLane4 = vld3q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]); + + uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]); + uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]); + uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16); + + uint8x8x3_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + + vst3_u8(dst + x, vRes); +#endif + } + break; + case 4: + for (; x <= colsn - 8*4; x += 8*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + u16* lidx0 = lane + x - 2*4; + u16* lidx1 = lane + x - 1*4; + u16* lidx3 = lane + x + 1*4; + u16* lidx4 = lane + x + 2*4; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vadd.i16 q3, q7 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vadd.i16 q7, q11 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q3, q7, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vmla.i16 q3, q11, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vrshrn.u16 d11, q3, #8 \n\t" + "vst4.8 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*4), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x4_t vLane0 = vld4q_u16(lidx0); + uint16x8x4_t vLane2 = vld4q_u16(lidx4); + uint16x8x4_t vLane4 = vld4q_u16(lidx1); + uint16x8x4_t vLane6 = vld4q_u16(lidx3); + uint16x8x4_t vLane8 = vld4q_u16(lane + x); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]); + uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]); + uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16); + + uint8x8x4_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8); + + vst4_u8(dst + x, vRes); +#endif + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn] + + u16(4) * (ln[k-cn] + ln[k+cn]) + + u16(6) * ln[k] + (1 << 7)) >> 8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + u16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t 
idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32)); + u32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint16x4_t vc6u16 = vmov_n_u16(6); + uint32x4_t vc6u32 = vmovq_n_u32(6); + uint32x4_t vc4u32 = vmovq_n_u32(4); + + for (size_t i = 0; i < size.height; ++i) + { + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ?
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint16x4_t v0 = vld1_u16(ln0+x); + uint16x4_t v1 = vld1_u16(ln1+x); + uint16x4_t v2 = vld1_u16(ln2+x); + uint16x4_t v3 = vld1_u16(ln3+x); + uint16x4_t v4 = vld1_u16(ln4+x); + + uint32x4_t v = vaddl_u16(v0, v4); + uint32x4_t v13 = vaddl_u16(v1, v3); + + v = vmlal_u16(v, v2, vc6u16); + v = vmlaq_u32(v, v13, vc4u32); + + vst1q_u32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + uint32x4_t lane0 = vld1q_u32(lane + x - 2); + uint32x4_t lane4 = vld1q_u32(lane + x + 2); + uint32x4_t lane1 = vld1q_u32(lane + x - 1); + uint32x4_t lane3 = vld1q_u32(lane + x + 1); + uint32x4_t lane2 = vld1q_u32(lane + x + 0); + + uint32x4_t ln04 = vaddq_u32(lane0, lane4); + uint32x4_t ln13 = vaddq_u32(lane1, lane3); + + uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32); + uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32); + + uint16x4_t ls = vrshrn_n_u32(lsw, 8); + + vst1_u16(dst + x, ls); + } + for (s32 h = 0; h < cn; ++h) + { + u32* ln = lane + h; + u16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<s16> _tmp; + s16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top,
borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0+x); + int16x4_t v1 = vld1_s16(ln1+x); + int16x4_t v2 = vld1_s16(ln2+x); + int16x4_t v3 = vld1_s16(ln3+x); + int16x4_t v4 = vld1_s16(ln4+x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + case 2: + case 3: + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + case 4: +/* for (; x <= colsn - 4*4; x += 4*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + ptrdiff_t* lidx0 = lane + x - 2*4; + ptrdiff_t* lidx1 = lane + x - 1*4; + ptrdiff_t* lidx3 = lane + x + 1*4; + ptrdiff_t* lidx4 = lane + x + 2*4; + + __asm__ __volatile__ ( + "vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i32 q0, q4 \n\t" + "vadd.i32 q1, q5 \n\t" + "vadd.i32 q2, q6 \n\t" + "vadd.i32 q3, q7 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i32 q4, q8 \n\t" + "vadd.i32 q5, q9 \n\t" + "vadd.i32 q6, q10 \n\t" + "vadd.i32 q7, q11 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i32 q0, q4, %q[c4] \n\t" + "vmla.i32 q1, q5, %q[c4] \n\t" + "vmla.i32 q2, q6, %q[c4] \n\t" + "vmla.i32 q3, q7, %q[c4] \n\t" + "vmla.i32 q0, q8, %q[c6] \n\t" + "vmla.i32 q1, q9, %q[c6] \n\t" + "vmla.i32 q2, q10, %q[c6] \n\t" + "vmla.i32 q3, q11, %q[c6] \n\t" + "vrshrn.i32 d8, q0, #8 \n\t" + "vrshrn.i32 d9, q1, #8 \n\t" + "vrshrn.i32 d10, q2, #8 \n\t" + "vrshrn.i32 d11, q3, #8 \n\t" + "vst4.16 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +*/ + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + s32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s32* dst = 
internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + int32x4_t v3 = vld1q_s32(ln3+x); + int32x4_t v4 = vld1q_s32(ln4+x); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + vst1q_s32(dst + x, lsw); + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s32* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/in_range.cpp b/3rdparty/carotene/src/in_range.cpp new file mode 100644 index 0000000000..b79a237e39 --- /dev/null +++ b/3rdparty/carotene/src/in_range.cpp @@ -0,0 +1,195 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); } +inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); } +inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); } + +template <typename T, int elsize> struct vtail +{ + static inline void inRange(const T *, const T *, const T *, + u8 *, size_t &, size_t) + { + //do nothing since there couldn't be enough data + } +}; +template <typename T> struct vtail<T, 2> +{ + static inline void inRange(const T * src, const T * rng1, const T * rng2, + u8 * dst, size_t &x, size_t width) + { + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::unsign::vec128 uvec128; + //There are no more than 15 elements in the tail, so we can handle an 8-element vector only once + if( x + 8 < width) + { + vec128 vs = internal::vld1q( src + x); + vec128 vr1 = internal::vld1q(rng1 + x); + vec128 vr2 = internal::vld1q(rng2 + x); + uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs)); + internal::vst1(dst + x, internal::vmovn(vd)); + x+=8; + } + } +}; +template <typename T> struct vtail<T, 1> +{ + static inline void inRange(const T * src, const T * rng1, const T * rng2, + u8 * dst, size_t &x, size_t width) + { + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::unsign::vec128 uvec128; + typedef typename internal::VecTraits<T>::vec64 vec64; + typedef typename internal::VecTraits<T>::unsign::vec64 uvec64; + //There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once + if( x + 16 < width) + { + vec128 vs = internal::vld1q( src + x); + vec128 vr1 = internal::vld1q(rng1 + x); + vec128 vr2 = internal::vld1q(rng2 + x); + uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs)); +
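// vd is an all-ones lane mask where rng1 <= src && src <= rng2, and zero otherwise +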
internal::vst1q(dst + x, vd); + x+=16; + } + if( x + 8 < width) + { + vec64 vs = internal::vld1( src + x); + vec64 vr1 = internal::vld1(rng1 + x); + vec64 vr2 = internal::vld1(rng2 + x); + uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs)); + internal::vst1(dst + x, vd); + x+=8; + } + } +}; + +template <typename T> +inline void inRangeCheck(const Size2D &_size, + const T * srcBase, ptrdiff_t srcStride, + const T * rng1Base, ptrdiff_t rng1Stride, + const T * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride) +{ + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::unsign::vec128 uvec128; + + Size2D size(_size); + if (srcStride == dstStride && + srcStride == rng1Stride && + srcStride == rng2Stride && + srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const size_t width = size.width & ~( 32/sizeof(T) - 1 ); + + for(size_t j = 0; j < size.height; ++j) + { + const T * src = internal::getRowPtr( srcBase, srcStride, j); + const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j); + const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j); + u8 * dst = internal::getRowPtr( dstBase, dstStride, j); + size_t i = 0; + for( ; i < width; i += 32/sizeof(T) ) + { + internal::prefetch(src + i); + internal::prefetch(rng1 + i); + internal::prefetch(rng2 + i); + + vec128 vs = internal::vld1q( src + i); + vec128 vr1 = internal::vld1q(rng1 + i); + vec128 vr2 = internal::vld1q(rng2 + i); + uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs)); + vs = internal::vld1q( src + i + 16/sizeof(T)); + vr1 = internal::vld1q(rng1 + i + 16/sizeof(T)); + vr2 = internal::vld1q(rng2 + i + 16/sizeof(T)); + uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs)); + vnst(dst + i, vd1, vd2); + } + vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width); + for( ; i < size.width; i++ ) + dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i])); + } +} + +} + +#define INRANGEFUNC(T) \ +void inRange(const Size2D &_size, \ + const T * srcBase, ptrdiff_t srcStride, \ + const T * rng1Base, ptrdiff_t rng1Stride, \ + const T * rng2Base, ptrdiff_t rng2Stride, \ + u8 * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + inRangeCheck(_size, srcBase, srcStride, \ + rng1Base, rng1Stride, rng2Base, rng2Stride, \ + dstBase, dstStride); \ +} +#else +#define INRANGEFUNC(T) \ +void inRange(const Size2D &, \ + const T *, ptrdiff_t, \ + const T *, ptrdiff_t, \ + const T *, ptrdiff_t, \ + u8 *, ptrdiff_t) \ +{ \ + internal::assertSupportedConfiguration(); \ +} +#endif + +INRANGEFUNC(u8) +INRANGEFUNC(s8) +INRANGEFUNC(u16) +INRANGEFUNC(s16) +INRANGEFUNC(s32) +INRANGEFUNC(f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/integral.cpp b/3rdparty/carotene/src/integral.cpp new file mode 100644 index 0000000000..56c919500e --- /dev/null +++ b/3rdparty/carotene/src/integral.cpp @@ -0,0 +1,238 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void integral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint32x4_t v_zero = vmovq_n_u32(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + u32 * sum = internal::getRowPtr(sumBase, sumStride, 0); + + uint32x4_t prev = v_zero; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8)); + uint32x4_t vsumh = vaddw_u16(prev, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1); + sum = internal::getRowPtr(sumBase, sumStride, i); + + prev = v_zero; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint32x4_t vsuml = vld1q_u32(prevSum + j); + uint32x4_t vsumh = vld1q_u32(prevSum + j + 4); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + 
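// el8shr1..el8shr3 are byte-shifted copies of the source vector; adding the widened copies accumulates a running prefix sum across the eight lanes +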
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + vsuml = vaddq_u32(vsuml, prev); + vsumh = vaddq_u32(vsumh, prev); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + vsuml = vaddw_u16(vsuml, vget_low_u16(el8)); + vsumh = vaddw_u16(vsumh, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]) + prevSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sumBase; + (void)sumStride; +#endif +} + +void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t v_zero8 = vmovq_n_u16(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0); + + double prev = 0.; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = (prev += src[j]*src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1); + sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i); + + prev = 0.; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + prevSqSum[j+k] + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = 
(prev += src[j]*src[j]) + prevSqSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sqsumBase; + (void)sqsumStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/intrinsics.hpp b/3rdparty/carotene/src/intrinsics.hpp new file mode 100644 index 0000000000..062a3f897b --- /dev/null +++ b/3rdparty/carotene/src/intrinsics.hpp @@ -0,0 +1,112 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#ifndef CAROTENE_INTRINSICS_HPP +#define CAROTENE_INTRINSICS_HPP + +#include <carotene/definitions.hpp> + +#include <arm_neon.h> + +namespace CAROTENE_NS { namespace internal { + +/////////////// Custom NEON intrinsics /////////////////// + +// calculate reciprocal value + +inline float32x4_t vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x2_t vrecp_f32(float32x2_t val) +{ + float32x2_t reciprocal = vrecpe_f32(val); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +// calculate sqrt value + +inline float32x4_t vrsqrtq_f32(float32x4_t val) +{ + float32x4_t e = vrsqrteq_f32(val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + return e; +} + +inline float32x2_t vrsqrt_f32(float32x2_t val) +{ + float32x2_t e = vrsqrte_f32(val); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + return e; +} + +inline float32x4_t vsqrtq_f32(float32x4_t val) +{ + return vrecpq_f32(vrsqrtq_f32(val)); +} + +inline float32x2_t vsqrt_f32(float32x2_t val) +{ + return vrecp_f32(vrsqrt_f32(val)); +} + +// table lookup with the table in a 128-bit register + +inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +{ +#ifdef __aarch64__ + // AArch64 supports this natively + return ::vqtbl1_u8(a, b); +#else + union { uint8x16_t v; uint8x8x2_t w; } u = { a }; + return vtbl2_u8(u.w, b); +#endif +} + +} } + +#endif diff --git a/3rdparty/carotene/src/laplacian.cpp b/3rdparty/carotene/src/laplacian.cpp new file mode 100644 index 0000000000..b9148de1b4 --- /dev/null +++ b/3rdparty/carotene/src/laplacian.cpp @@ -0,0 +1,713 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" + +#include <vector> + +namespace CAROTENE_NS { + +bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint8x8_t vsub; + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ?
srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x3; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + vsub = x1; + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + + int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0), + vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub))); + uint8x8_t it0 = vqmovun_s16(tt0); + vst1_u8(drow + x - 8, it0); + + vsub = x1; + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue * 3; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + srow1[x] + srow0[x]; + } + else + { + nextx = (srow2 ? srow2[x + 1] : borderValue) + + srow1[x + 1] + + (srow0 ? srow0[x + 1] : borderValue); + } + + s32 val = (prevx + currx + nextx) - 9 * srow1[x]; + drow[x] = internal::saturate_cast<u8>((s32)val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && + size.width >= 8 && size.height >= 1 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REPLICATE); +} + +void Laplacian1OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector<u8> _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v2 = 0; + // make border + if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tcurr = vmovq_n_s16(0x0); + int16x8_t tnext = vmovq_n_s16(0x0); + int16x8_t t0, t2; + uint8x8_t xx0 = vmov_n_u8(0x0); + uint8x8_t xx1 = vmov_n_u8(0x0); + uint8x8_t xx2 = vmov_n_u8(0x0); + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 2 < rows ?
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + + if(x) { + xx0 = xx1; + xx1 = xx2; + } else { + xx1 = x1; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + xx1 = vset_lane_u8(borderValue, x1, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7); + } + } + xx2 = x1; + + if(x) { + tcurr = tnext; + } + tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)), + vreinterpretq_s16_u16(vshll_n_u8(x1, 2))); + + if(!x) { + tcurr = tnext; + continue; + } + t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7))); + t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1))); + t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr); + + vst1q_s16(drow + x - 8, t0); + } + + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx; + s16 prevx; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v1[0] : v1[x-1]; + nextx = x == cols-1 ? v1[x] : v1[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v1[1] : v1[x-1]; + nextx = x == cols-1 ? v1[x-1] : v1[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v1[x-1]; + nextx = x == cols-1 ? borderValue : v1[x+1]; + } + *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector<u8> _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v2 = 0; + // make border + if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tprev = vmovq_n_s16(0x0); + int16x8_t tcurr = vmovq_n_s16(0x0); + int16x8_t tnext = vmovq_n_s16(0x0); + int16x8_t tc = vmovq_n_s16(0x0); + int16x8_t t0, t2, tcnext; + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 2 < rows ?
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2)); + + if(x) { + tprev = tcurr; + tcurr = tnext; + } + tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); + + if(!x) { + tcurr = tnext; + tc = tcnext; + + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_s16(borderValue, tcurr, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7); + } + continue; + } + + t0 = vextq_s16(tprev, tcurr, 7); + t2 = vextq_s16(tcurr, tnext, 1); + + t0 = vsubq_s16(vqaddq_s16(t0, t2), tc); + tc = tcnext; + + t0 = vshlq_n_s16(t0, 1); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx, nextx2; + s16 prevx, prevx2; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v0[0] : v0[x-1]; + prevx2 = x == 0 ? v2[0] : v2[x-1]; + nextx = x == cols-1 ? v0[x] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x] : v2[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v0[1] : v0[x-1]; + prevx2 = x == 0 ? v2[1] : v2[x-1]; + nextx = x == cols-1 ? v0[x-1] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x-1] : v2[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v0[x-1]; + prevx2 = x == 0 ? borderValue : v2[x-1]; + nextx = x == cols-1 ? borderValue : v0[x+1]; + nextx2 = x == cols-1 ? borderValue : v2[x+1]; + } + s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector<u8> _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = 0; + const u8* v2 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v3 = 0; + const u8* v4 = 0; + // make border + if (border == BORDER_MODE_REPLICATE) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0); + } else if (border == BORDER_MODE_REFLECT) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ?
2*rows-(y+3) : 0); + } else if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); // note: check this REFLECT101 mapping of row y-2 on very small images + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0); // note: falls back to row 0 when rows==2 && y==1, although true REFLECT101 would reflect to row 1 + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp; + v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp; + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tnext, tc, t0; + int16x8_t tnext2, tnext3; + int16x8_t tnext1Old, tnext2Old, tnext3Old; + int16x8_t tnext4OldOldOld, tnext5OldOldOld; + + int16x8_t tcurr1 = vmovq_n_s16(0x0); + int16x8_t tnext1 = vmovq_n_s16(0x0); + int16x8_t tprev1 = vmovq_n_s16(0x0); + int16x8_t tpprev1 = vmovq_n_s16(0x0); + int16x8_t tppprev1 = vmovq_n_s16(0x0); + + int16x8_t tnext4Old = vmovq_n_s16(0x0); + int16x8_t tnext5Old = vmovq_n_s16(0x0); + int16x8_t tnext1OldOld = vmovq_n_s16(0x0); + int16x8_t tnext2OldOld = vmovq_n_s16(0x0); + int16x8_t tnext3OldOld = vmovq_n_s16(0x0); + int16x8_t tnext4OldOld = vmovq_n_s16(0x0); + int16x8_t tnext5OldOld = vmovq_n_s16(0x0); + + // do vertical convolution + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + internal::prefetch(v3 + x); + internal::prefetch(v4 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + uint8x8_t x3 = vld1_u8(v3 + x); + uint8x8_t x4 = vld1_u8(v4 + x); + if(x) { + tcurr1 = tnext1; + } + + tnext4OldOldOld = tnext4Old; + tnext5OldOldOld = tnext5Old; + tnext1Old = tnext1OldOld; + tnext2Old = tnext2OldOld; + tnext3Old = tnext3OldOld; + tnext4Old = tnext4OldOld; + tnext5Old = tnext5OldOld; + + tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1))); + tnext3 = vshlq_n_s16(tnext3, 1); + + tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2)); + tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0)); + tnext2 = vsubq_s16(tc, tnext); + + tnext1 = vaddq_s16(tnext3, tnext2); + // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4 + + tnext2 = vshlq_n_s16(tnext2, 1); + // tnext2 = 2*x4 - 4*x2 + 2*x0 + + tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1)); + // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4 + + tnext1OldOld = tnext1; + tnext2OldOld = tnext2; + tnext3OldOld = tnext3; + tnext4OldOld = tnext2; + tnext5OldOld = tnext1; + + if(x) { + tnext1 = vextq_s16(tnext1Old, tnext1, 2); + tcurr1 = vextq_s16(tnext2Old, tnext2, 1); + tprev1 = tnext3Old; + + if(x!=8) { + tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7); + tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6); + } + } + + if(!x) { + // make border + if (border == BORDER_MODE_REPLICATE) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 =
vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT101) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0); + } else if (border == BORDER_MODE_CONSTANT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 0); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 1); + } + tppprev1 = tprev1; + continue; + } + + t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1); + t0 = vaddq_s16(t0, t0); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x >= cols - 1) + x = cols-2; + + s16 pprevx = 0; + s16 prevx = 0; + s16 nextx = 0; + s16 nnextx = 0; + + for( ; x < cols; x++ ) + { + if (x == 0) { + // make border + if (border == BORDER_MODE_REPLICATE) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2]; + prevx = 2*v0[1] - 4*v2[1] + 2*v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + prevx = 0; + } + } else if (x == 1) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + } + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else { + pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + } + s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x]; + if (x == cols-1) { + // make border + if (border == BORDER_MODE_REPLICATE) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_REFLECT) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1]; + } else if (border == BORDER_MODE_REFLECT101) { + nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + } else if (border == BORDER_MODE_CONSTANT) { + nextx = 0; + nnextx = 8 * borderValue; + } + } else if (x == cols-2) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1]; + } else if (border == BORDER_MODE_REFLECT101) { + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_CONSTANT) { + nnextx = 8 * borderValue; + } + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + } else { + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2]; + } + s16 res = pprevx + prevx + currx + nextx + nnextx; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; 
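+ /* Editorial note (hedged sketch, not part of NVIDIA's original code):
+  * Laplacian3OpenCV above applies 2*[1 0 1; 0 -4 0; 1 0 1], which is exactly
+  * OpenCV's ksize==3 Laplacian kernel, and this function reproduces OpenCV's
+  * ksize==5 kernel via the five per-column sums named pprevx/prevx/currx/
+  * nextx/nnextx. A plain scalar cross-check for the 3x3 variant under
+  * BORDER_MODE_REPLICATE could look like:
+  *
+  *   static ptrdiff_t clampIdx(ptrdiff_t v, ptrdiff_t n)
+  *   {
+  *       return v < 0 ? 0 : v >= n ? n - 1 : v;  // replicate border
+  *   }
+  *
+  *   static s16 laplacian3Ref(const u8 *srcBase, ptrdiff_t srcStride,
+  *                            ptrdiff_t rows, ptrdiff_t cols,
+  *                            ptrdiff_t y, ptrdiff_t x)
+  *   {
+  *       const u8 *up  = internal::getRowPtr(srcBase, srcStride, clampIdx(y-1, rows));
+  *       const u8 *cur = internal::getRowPtr(srcBase, srcStride, y);
+  *       const u8 *dn  = internal::getRowPtr(srcBase, srcStride, clampIdx(y+1, rows));
+  *       ptrdiff_t xl = clampIdx(x-1, cols), xr = clampIdx(x+1, cols);
+  *       // same result as the scalar tail of Laplacian3OpenCV
+  *       return (s16)(2 * (up[xl] + up[xr] + dn[xl] + dn[xr] - 4 * cur[x]));
+  *   }
+  */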
+ (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/magnitude.cpp b/3rdparty/carotene/src/magnitude.cpp new file mode 100644 index 0000000000..cd9d82bf6c --- /dev/null +++ b/3rdparty/carotene/src/magnitude.cpp @@ -0,0 +1,160 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include <cmath> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +struct Magnitude +{ + typedef s16 type; + + void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1, + int16x8_t & v_dst) const + { + int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1); + float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)), + vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p))); + v_src0_p = vget_high_s16(v_src0); + v_src1_p = vget_high_s16(v_src1); + float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)), + vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p))); + + int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0)); + int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1)); + + v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1)); + } + + void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1, + int16x4_t & v_dst) const + { + float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)), + vcvtq_f32_s32(vmull_s16(v_src1, v_src1))); + int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp)); + v_dst = vqmovn_s32(v_sqrt); + } + + void operator() (const short * src0, const short * src1, short * dst) const + { + f32 src0val = (f32)src0[0], src1val = (f32)src1[0]; + dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val)); + } +}; + +struct MagnitudeF32 +{ + typedef f32 type; + + void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1, + float32x4_t & v_dst) const + { + v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1))); + } + + void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1, + float32x2_t & v_dst) const + { + v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1))); + } + + void operator() (const f32 * src0, const f32 * src1, f32 * dst) const + { + dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]); + } +}; + +} // namespace + +#endif + +void magnitude(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + Magnitude()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void magnitude(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + MagnitudeF32()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/meanstddev.cpp b/3rdparty/carotene/src/meanstddev.cpp new file mode 100644 index 0000000000..a847493429 --- /dev/null +++ b/3rdparty/carotene/src/meanstddev.cpp @@ -0,0 +1,163 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software.
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <cmath> + +namespace CAROTENE_NS { + +void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f64 fsum = 0.0f, fsqsum = 0.0f; + sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1); + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; + f64 fsum = 0.0f, fsqsum = 0.0f; + + f32 arsum[8]; + uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; + float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw4) + { + size_t blockSize = std::min(roiw4 - j, blockSize0) + j; + v_sum = v_zero; + v_sqsum = v_zero_f; + + for ( ; j + 16 < blockSize ; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + // 0 + uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); + uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); + float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); +
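+ // Editorial note (annotation, not in the original patch): both 16-bit halves
+ // are squared in f32 and the partial sums are drained to the f64 accumulators
+ // every blockSize0 (= 1024) elements; a much longer accumulation could
+ // overflow the u32 sum and would lose precision in the f32 sum of squares.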
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + + // 1 + v_srclo = vmovl_u16(vget_low_u16(v_src1)); + v_srchi = vmovl_u16(vget_high_u16(v_src1)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + v_srclo_f = vcvtq_f32_u32(v_srclo); + v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); + v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + } + + for ( ; j < blockSize; j += 4) + { + uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); + float32x4_t v_src_f = vcvtq_f32_u32(v_src); + v_sum = vaddq_u32(v_sum, v_src); + v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); + } + + vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); + vst1q_f32(arsum + 4, v_sqsum); + + fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; + fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; + } + + // collect a few last elements in the current row + for ( ; j < size.width; ++j) + { + f32 srcval = src[j]; + fsum += srcval; + fsqsum += srcval * srcval; + } + } + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/median_filter.cpp b/3rdparty/carotene/src/median_filter.cpp new file mode 100644 index 0000000000..8c5d08b7ee --- /dev/null +++ b/3rdparty/carotene/src/median_filter.cpp @@ -0,0 +1,227 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +/* + * The code here is based on the code in + * , which is in public domain. + * See also . + */ + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace { + + uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn) + { + u8 buf[16+8]; + vst1q_u8(buf+cn, r); + for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i]; + return vld1q_u8(buf); + } + + uint8x8_t getRightReplicate(uint8x8_t r, u32 cn) + { + u8 buf[8+8]; + vst1_u8(buf, r); + for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i]; + return vld1_u8(buf+cn); + } + +} // namespace + +//o------^-------^-----------------------------o 0 +// | | +//o--^---v---^---|-------^---------------------o 1 +// | | | | +//o--v-------v---|-------|-^-------^-------^---o 2 +// | | | | | +//o------^-------v-----^-|-|-------|-------|---o 3 +// | | | | | | +//o--^---v---^-----^---|-v-|---^---v---^---v---o 4 +// | | | | | | | +//o--v-------v---^-|---|---v---|-------|-------o 5 +// | | | | | +//o------^-------|-|---v-------|-------v-------o 6 +// | | | | +//o--^---v---^---|-v-----------v---------------o 7 +// | | | +//o--v-------v---v-----------------------------o 8 + +#define ELT(num, level) v ## num ## _lv ## level +#define PIX_SORT(a, alvl, b, blvl, newlvl) \ + PIX_MIN(a, alvl, b, blvl, newlvl); \ + PIX_MAX(a, alvl, b, blvl, newlvl); + +#define SORT9 \ + PIX_SORT(1, 00, 2, 00, 01); \ + PIX_SORT(4, 00, 5, 00, 02); \ + PIX_SORT(7, 00, 8, 00, 03); \ + PIX_SORT(0, 00, 1, 01, 04); \ + PIX_SORT(3, 00, 4, 02, 05); \ + PIX_SORT(6, 00, 7, 03, 06); \ + PIX_SORT(1, 04, 2, 01, 07); \ + PIX_SORT(4, 05, 5, 02, 08); \ + PIX_SORT(7, 06, 8, 03, 09); \ + PIX_MAX (0, 04, 3, 05, 10); \ + PIX_MIN (5, 08, 8, 09, 11); \ + PIX_SORT(4, 08, 7, 09, 12); \ + PIX_MAX (3, 10, 6, 06, 13); \ + PIX_MAX (1, 07, 4, 12, 14); \ + PIX_MIN (2, 07, 5, 11, 15); \ + PIX_MIN (4, 14, 7, 12, 16); \ + PIX_SORT(4, 16, 2, 15, 17); \ + PIX_MAX (6, 13, 4, 17, 18); \ + PIX_MIN (4, 18, 2, 17, 19); + +#endif + +bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels) +{ + return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8; +} + +void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels)); +#ifdef CAROTENE_NEON + u32 cn = numChannels; + size_t colsn = size.width * cn; + + for (size_t i = 0; i < size.height; ++i) { + const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i); + const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride; + const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride; + u8* pdst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + { + uint8x16_t v3_lv00 = vld1q_u8(psrc0); + uint8x16_t v4_lv00 = vld1q_u8(psrc1); + uint8x16_t v5_lv00 = vld1q_u8(psrc2); + uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn); + uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn); + uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn); + uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn); + uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn); + uint8x16_t v2_lv00 = srcMargin.left > 0 ? 
vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn); + + goto medianBlur3x3_mainBody; + + for (; j < colsn - 16; j += 16) { + internal::prefetch(psrc0 + j); + internal::prefetch(psrc1 + j); + internal::prefetch(psrc2 + j); + + v0_lv00 = vld1q_u8(psrc0 + j - cn); + v1_lv00 = vld1q_u8(psrc1 + j - cn); + v2_lv00 = vld1q_u8(psrc2 + j - cn); + v3_lv00 = vld1q_u8(psrc0 + j); + v4_lv00 = vld1q_u8(psrc1 + j); + v5_lv00 = vld1q_u8(psrc2 + j); + v6_lv00 = vld1q_u8(psrc0 + j + cn); + v7_lv00 = vld1q_u8(psrc1 + j + cn); + v8_lv00 = vld1q_u8(psrc2 + j + cn); + +medianBlur3x3_mainBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1q_u8(pdst + j, v4_lv19); + } + } + + { + size_t k = colsn - 8; + uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn); + uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn); + uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn); + uint8x8_t v3_lv00 = vld1_u8(psrc0 + k); + uint8x8_t v4_lv00 = vld1_u8(psrc1 + k); + uint8x8_t v5_lv00 = vld1_u8(psrc2 + k); + uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn); + uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn); + uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn); + + goto medianBlur3x3_tailBody; + + for (; k >= j - 8; k -= 8) { + v0_lv00 = vld1_u8(psrc0 + k - cn); + v1_lv00 = vld1_u8(psrc1 + k - cn); + v2_lv00 = vld1_u8(psrc2 + k - cn); + v3_lv00 = vld1_u8(psrc0 + k); + v4_lv00 = vld1_u8(psrc1 + k); + v5_lv00 = vld1_u8(psrc2 + k); + v6_lv00 = vld1_u8(psrc0 + k + cn); + v7_lv00 = vld1_u8(psrc1 + k + cn); + v8_lv00 = vld1_u8(psrc2 + k + cn); + +medianBlur3x3_tailBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1_u8(pdst + k, v4_lv19); + } + } + } +#else + (void)size; + (void)numChannels; + (void)srcBase; + (void)srcStride; + (void)srcMargin; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/min_max.cpp b/3rdparty/carotene/src/min_max.cpp new file mode 100644 index 0000000000..d6f4017841 --- /dev/null +++ b/3rdparty/carotene/src/min_max.cpp @@ -0,0 +1,139 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include <algorithm> + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +struct Min +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vminq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vmin(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = std::min(src0[0], src1[0]); + } +}; + +template <typename T> +struct Max +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vmaxq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vmax(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = std::max(src0[0], src1[0]); + } +}; + +} // namespace + +#define IMPL_OP(fun, op, type) \ +void fun(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + type * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + internal::vtransform(size, \ + src0Base, src0Stride, \ + src1Base, src1Stride, \ + dstBase, dstStride, op<type>()); \ +} + +#else + +#define IMPL_OP(fun, op, type) \ +void fun(const Size2D &, \ + const type *, ptrdiff_t, \ + const type *, ptrdiff_t, \ + type *, ptrdiff_t) \ +{ \ + internal::assertSupportedConfiguration(); \ +} + +#endif + +#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type) + +IMPL_MINMAX(u8) +IMPL_MINMAX(s8) +IMPL_MINMAX(u16) +IMPL_MINMAX(s16) +IMPL_MINMAX(u32) +IMPL_MINMAX(s32) +IMPL_MINMAX(f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/minmaxloc.cpp b/3rdparty/carotene/src/minmaxloc.cpp new file mode 100644 index 0000000000..a7f30bc4f8 --- /dev/null +++ b/3rdparty/carotene/src/minmaxloc.cpp @@ -0,0 +1,1340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include <limits> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +void minMaxVals(const Size2D &size, + const T * srcBase, ptrdiff_t srcStride, + T * pMinVal, T * pMaxVal) +{ + using namespace internal; + + typedef typename VecTraits<T>::vec128 vec128; + typedef typename VecTraits<T>::vec64 vec64; + + u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ?
size.width - step_tail + 1 : 0; + + T maxVal = std::numeric_limits<T>::min(); + T minVal = std::numeric_limits<T>::max(); + vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal); + vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal); + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for (; j < roiw_base; j += step_base) + { + prefetch(src + j); + vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T)); + v_min_base = vminq(v_min_base, v_src0); + v_max_base = vmaxq(v_max_base, v_src0); + v_min_base = vminq(v_min_base, v_src1); + v_max_base = vmaxq(v_max_base, v_src1); + } + for (; j < roiw_tail; j += step_tail) + { + vec64 v_src0 = vld1(src + j); + v_min_tail = vmin(v_min_tail, v_src0); + v_max_tail = vmax(v_max_tail, v_src0); + } + + for (; j < size.width; j++) + { + T srcval = src[j]; + minVal = std::min(srcval, minVal); + maxVal = std::max(srcval, maxVal); + } + } + + // collect min & max values + T ar[16 / sizeof(T)]; + vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))), + vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base))))); + + for (size_t x = 0; x < 8u / sizeof(T); ++x) + { + minVal = std::min(minVal, ar[x]); + maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]); + } + + if (pMaxVal) + *pMaxVal = maxVal; + if (pMinVal) + *pMinVal = minVal; +} + +} // namespace + +#endif + +void minMaxVals(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * pMinVal, u8 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals<u8>(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * pMinVal, s16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals<s16>(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * pMinVal, u16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals<u16>(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * pMinVal, s32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals<s32>(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 * pMinVal, u32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals<u32>(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l =
0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + float32x4_t line = vld1q_f32(src + i); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + float val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = std::numeric_limits<f32>::max(); + minCol = size.width; + minRow = size.height; + maxVal = -std::numeric_limits<f32>::max(); + maxCol = size.width; + maxRow = size.height; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + const u8 * mask = internal::getRowPtr( maskBase, maskStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t uOne = vdupq_n_u32(1); + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); +
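+ // Editorial note (annotation): 0xffffFFFC acts as a "no lane ever matched"
+ // sentinel; the block length is capped at 0xffffFFFC elements, so real lane
+ // offsets always stay below it and the minIdx/maxIdx < 0xffffFFFC tests after
+ // the loop reliably detect whether this row updated the running extrema.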
float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + internal::prefetch(mask + i); + float32x4_t line = vld1q_f32(src + i); + uint8x8_t maskLine = vld1_u8(mask + i); + + uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine))); + maskLine4 = vcgeq_u32(maskLine4, uOne); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + minmask = vandq_u32(minmask, maskLine4); + maxmask = vandq_u32(maxmask, maskLine4); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; i++ ) + { + if (!mask[i]) + continue; + f32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)maskBase; + (void)maskStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int32x4_t n_min = vdupq_n_s32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + int32x4_t n_max = vdupq_n_s32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4 ) + { + internal::prefetch(src + i); + int32x4_t line = vld1q_s32(src + i); + + uint32x4_t minmask = vcltq_s32(line, n_min); + uint32x4_t maxmask = vcgtq_s32(line, n_max); + + n_min = vbslq_s32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_s32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, 
lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + s32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_s32(fmin, n_min); + vst1q_s32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + s32 minval = fmin[j]; + s32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + s32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int16x8_t n_min = vdupq_n_s16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + int16x8_t n_max = vdupq_n_s16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + int16x8_t line = vld1q_s16(src + i); + + uint16x8_t minmask = vcltq_s16(line, n_min); + uint16x8_t maxmask = vcgtq_s16(line, n_max); + + n_min = vbslq_s16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + 
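+ // Editorial note (annotation): unlike the f32/s32 paths, the 16-bit loop above
+ // tracks eight lanes per iteration, so each comparison mask is split into
+ // low/high halves and widened with vmovl_u16 (true lanes become 0x0000FFFF);
+ // vqshlq_n_u32(mask, 31) then saturates every nonzero lane to 0xFFFFFFFF,
+ // i.e. a full bit-select mask for updating the two 32-bit index accumulators
+ // with vbslq_u32.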
+ // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32((int32_t) 4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + s16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_s16(fmin, n_min); + vst1q_s16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + s16 minval = fmin[j]; + s16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + short val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const u16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + uint16x8_t n_min = vdupq_n_u16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + uint16x8_t n_max = vdupq_n_u16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + uint16x8_t line = vld1q_u16(src + i); + + uint16x8_t minmask = vcltq_u16(line, n_min); + uint16x8_t maxmask = vcgtq_u16(line, n_max); + + n_min = vbslq_u16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, 
n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + + // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32(4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + u16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_u16(fmin, n_min); + vst1q_u16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + u16 minval = fmin[j]; + u16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + u16 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +void minMaxLocBlock(const u8 * src, u32 len, + u8 &minVal, u16 &minIdx, + u8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + uint8x16_t n_min = vdupq_n_u8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + uint8x16_t n_max = vdupq_n_u8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + uint8x16_t line = vld1q_u8(src + i); + + uint8x16_t minmask = vcltq_u8(line, n_min); + uint8x16_t maxmask = vcgtq_u8(line, n_max); + + n_min = vbslq_u8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + u8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + /*{ + uint8x8_t min_low = vget_low_u8(n_min); + uint8x8_t min_high = vget_high_u8(n_min); + uint8x8_t 
max_low = vget_low_u8(n_max); + uint8x8_t max_high = vget_high_u8(n_max); + + uint8x8_t minmask = vclt_u8(min_low, min_high); + uint8x8_t maxmask = vcgt_u8(max_low, max_high); + + uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high); + uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high); + + uint16x8_t minidxmask = vmovl_u8(minmask); + uint16x8_t maxidxmask = vmovl_u8(maxmask); + minidxmask = vqshlq_n_u16(minidxmask, 15); + maxidxmask = vqshlq_n_u16(maxidxmask, 15); + + uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh); + uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh); + + vst1_u8((uint8_t*)fmin, min2); + vst1_u8((uint8_t*)fmax, max2); + + vst1q_u16((uint16_t*)(fminIdx), n_minIdx); + vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx); + }*/ + + vst1q_u8(fmin, n_min); + vst1q_u8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + u8 minval = fmin[j]; + u8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +void minMaxLocBlock(const s8 * src, u32 len, + s8 &minVal, u16 &minIdx, + s8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + int8x16_t n_min = vdupq_n_s8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + int8x16_t n_max = vdupq_n_s8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + int8x16_t line = vld1q_s8(src + i); + + uint8x16_t minmask = vcltq_s8(line, n_min); + uint8x16_t maxmask = vcgtq_s8(line, n_max); + + n_min = vbslq_s8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + s8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + + vst1q_s8(fmin, n_min); + vst1q_s8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, 
n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + s8 minval = fmin[j]; + s8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +} // namespace +#endif // CAROTENE_NEON + +#define USHORT_BLOCK_MAX_SIZE (1 << 16) + +void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const u8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + u8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == 0 && locMaxVal == 255) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = 0; + maxVal = 255; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } + + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const s8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + s8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? 
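// Reviewer note: rows wider than 128 pixels are scanned in blocks of at most
// USHORT_BLOCK_MAX_SIZE (1 << 16) elements so the per-block positions returned by
// minMaxLocBlock always fit in a u16. Blocks are visited left to right and merged with
// strict comparisons, which preserves first-occurrence semantics, and the
// (locMinVal == 0 && locMaxVal == 255) test is an early exit: once both ends of the u8
// range have been seen, no later pixel can improve either bound. Merge step per block
// (illustrative sketch only):
//
//   if (locMinVal < minVal) { minVal = locMinVal; minCol = blockStart + locMinIdx; minRow = l; }
//   if (locMaxVal > maxVal) { maxVal = locMaxVal; maxCol = blockStart + locMaxIdx; maxRow = l; }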
tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == -128 && locMaxVal == 127) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = -128; + maxVal = 127; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minRow = l; + minCol = i; + } + else if( val > maxVal ) + { + maxVal = val; + maxRow = l; + maxCol = i; + } + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/morph.cpp b/3rdparty/carotene/src/morph.cpp new file mode 100644 index 0000000000..bcc6aa7e06 --- /dev/null +++ b/3rdparty/carotene/src/morph.cpp @@ -0,0 +1,728 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +#include <algorithm> +#include <cstring> +#include <limits> +#include <vector> + +namespace CAROTENE_NS { + +bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 16 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +#ifdef CAROTENE_NEON + +namespace { + +struct ErodeVecOp +{ + ErodeVecOp():borderValue(0){} + + ErodeVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits<u8>::max(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vminq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmin_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::min(a, b); + } + + u8 borderValue; +}; + +struct DilateVecOp +{ + DilateVecOp():borderValue(0){} + + DilateVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits<u8>::min(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vmaxq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmax_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::max(a, b); + } + + u8 borderValue; +}; + +template <typename VecOp> +void morph3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, const VecOp & vop) +{ + u8 borderValue = vop.borderValue; + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + const uint8x16_t v_zero = vdupq_n_u8(0); + const uint8x16_t v_border = vdupq_n_u8(borderValue); + + uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + u8 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16); + + // perform vertical convolution + for ( ; x <= bwidth; x += 16) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x); + uint8x16_t x1 = vld1q_u8(srow1 + x); + uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 16 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = vop(srow1[x4], + vop(srow2 ? srow2[x4] : borderValue, + srow0 ? srow0[x4] : borderValue)); + + currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? 
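// Reviewer note: morph3x3 evaluates the separable 3x3 min/max in two stages. tnext
// holds vop(row_above, row, row_below) for the next 16 columns, and the horizontal
// stage below combines three byte-shifted views of those column results with vextq_u8.
// Per output pixel (illustrative sketch only):
//
//   col[x] = vop(src[y-1][x], vop(src[y][x], src[y+1][x]));  // vertical stage
//   dst[x] = vop(col[x-1], vop(col[x], col[x+1]));           // horizontal stage
//
// t0 = vextq_u8(tprev, tcurr, 15) is the col[x-1] view and
// t2 = vextq_u8(tcurr, tnext, 1) the col[x+1] view of the current 16 columns.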
srow0[x3] : borderValue)); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vop(vop(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u8(tprev, tcurr, 15); + t1 = tcurr; + t2 = vextq_u8(tcurr, tnext, 1); + + // and add them + t0 = vop(t0, vop(t1, t2)); + + vst1q_u8(drow + x - 16, t0); + } + + x -= 16; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue; + else if (border == BORDER_MODE_REPLICATE) + nextx = vop(srow2[x], vop(srow1[x], srow0[x])); + } + else + nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue, + srow0 ? srow0[x + 1] : borderValue), + srow1[x + 1]); + + drow[x] = vop(prevx, vop(currx, nextx)); + + // make shift + prevx = currx; + currx = nextx; + } + } +} + +} // namespace + +#endif + +void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, ErodeVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, DilateVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +template +void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize) +{ + size_t i, j, k; + size_t width16 = (width & -16) * cn; + size_t width8 = (width & -8) * cn; + width *= cn; + + if (ksize == 1) + { + for (i = 0; i < width; i++) + dst[i] = src[i]; + return; + } + + ksize = ksize*cn; + VecUpdate updateOp; + switch(cn) + { + case 1: + for (i = 0; i < width16; i += 16) + { + const u8* sptr = src + i; + uint8x16_t s = vld1q_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1q_u8(sptr + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + const u8* sptr = src + i; + uint8x8_t s = vld1_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1_u8(sptr + k)); + + vst1_u8(dst + i, s); + } + break; + default: + for (i = 0; i < width16; i += 16) + { + uint8x16_t s = vld1q_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1q_u8(src + i + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + uint8x8_t s = vld1_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1_u8(src + i + k)); + + vst1_u8(dst + i, s); + } + break; + } + + ptrdiff_t i0 = i; + for( k = 0; k < (size_t)cn; k++, src++, dst++ ) + { 
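// Reviewer note: scalar tail of MorphRow, handled per channel. The first inner loop
// emits two outputs per iteration by reusing m, the min/max of the kernel elements the
// two overlapping windows share: window(i) covers s[0..ksize-cn] and window(i+cn)
// covers s[cn..ksize], so m = vop(s[cn], ..., s[ksize-cn]) is common to both.
// Equivalent plain form (illustrative sketch only):
//
//   u8 m = s[cn];
//   for (size_t j = 2*cn; j < ksize; j += cn) m = updateOp(m, s[j]);
//   dst[i]    = updateOp(m, s[0]);
//   dst[i+cn] = updateOp(m, s[ksize]);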
+ for( i = i0; i <= width - cn*2; i += cn*2 ) + { + const u8* s = src + i; + u8 m = s[cn]; + for( j = cn*2; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = updateOp(m, s[0]); + dst[i+cn] = updateOp(m, s[j]); + } + + for( ; i < width; i += cn ) + { + const u8* s = src + i; + u8 m = s[0]; + for( j = cn; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = m; + } + } +} + +template +void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize) +{ + size_t i, k; + size_t width32 = width & -32; + VecUpdate updateOp; + + uint8x16_t x0,x1,s0,s1; + if (ksize == 3) + { + for (; count > 1; count -= 2, dst += dststep * 2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + sptr = src[2] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[3] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + } + else if (ksize > 1) + for (; count > 1; count -= 2, dst += dststep*2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + for (k = 2; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + + for (; count > 0; count--, dst += dststep, src++) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[0] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + for (k = 1; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + vst1q_u8(dst + i, s0); + vst1q_u8(dst + i + 16, s1); + } + for(; i < width; i++ ) + { + u8 s = src[0][i]; + for( k = 1; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + dst[i] = s; + } + } +} + +template +inline void morphology(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + //Temporary buffers common for all 
iterations + std::vector _srcRow(cn*(ssize.width + ksize.width - 1)); + u8* srcRow = &_srcRow[0]; + + size_t bufRows = std::max(ksize.height + 3, std::max(anchorY, ksize.height-anchorY-1)*2+1); + std::vector _rows(bufRows); + u8** rows = &_rows[0]; + + // adjust swidthcn so that the used part of buffers stays compact in memory + ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size) + std::vector _ringBuf(swidthcn*bufRows+16); + u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16); + + size_t borderLength = std::max(ksize.width - 1, 1) * cn; + std::vector _borderTab(borderLength); + ptrdiff_t * borderTab = &_borderTab[0]; + + std::vector _constBorderValue; + std::vector _constBorderRow; + u8 * constBorderValue = NULL; + u8 * constBorderRow = NULL; + if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderValue.resize(borderLength); + constBorderValue = &_constBorderValue[0]; + size_t i; + for(i = 0; i < cn; i++) + constBorderValue[i] = borderValues[i]; + for(; i < borderLength; i++) + constBorderValue[i] = constBorderValue[i-cn]; + + if( columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16)); + constBorderRow = internal::alignPtr(&_constBorderRow[0], 16); + size_t N = (ssize.width + ksize.width - 1)*cn; + for( i = 0; i < N; i += borderLength ) + { + size_t n = std::min( borderLength, N - i ); + for(size_t j = 0; j < n; j++) + srcRow[i+j] = constBorderValue[j]; + } + MorphRow(srcRow, constBorderRow, ssize.width, cn, ksize.width); + } + } + + Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right, + ssize.height + borderMargin.top + borderMargin.bottom); + + ptrdiff_t dx1 = std::max(anchorX - (ptrdiff_t)borderMargin.left, 0); + ptrdiff_t dx2 = std::max((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0); + // recompute border tables + if( dx1 > 0 || dx2 > 0 ) + { + if( rowBorderType == BORDER_MODE_CONSTANT ) + { + memcpy( srcRow, &constBorderValue[0], dx1*cn ); + memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn ); + } + else + { + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX) - borderMargin.left; + + ptrdiff_t wholeWidth = wholeSize.width; + + ptrdiff_t i, j; + for( i = 0; i < dx1; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[i*cn + j] = p0 + j; + } + + for( i = 0; i < dx2; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[(i + dx1)*cn + j] = p0 + j; + } + } + } + + ptrdiff_t startY, startY0, endY, rowCount; + startY = startY0 = std::max(borderMargin.top - anchorY, 0); + endY = std::min(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height); + + const u8* src = srcBase + (startY - borderMargin.top)*srcStride; + u8* dst = dstBase; + + ptrdiff_t width = ssize.width, kwidth = ksize.width; + ptrdiff_t kheight = ksize.height, ay = anchorY; + ptrdiff_t width1 = ssize.width + kwidth - 1; + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX); + bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT; + ptrdiff_t dy = 0, i = 0; + + src -= xofs1*cn; + ptrdiff_t count = endY - startY; + + rowCount = 0; + for(;; dst += dstStride*i, dy += i) + { + ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top; 
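// Reviewer note: the outer loop implements a classic row ring buffer. Each source row
// is border-extended into srcRow, filtered horizontally once by MorphRow into ringBuf
// (bufRows slots of swidthcn bytes each), and then reused by every vertical window
// that overlaps it. dcount works out how many fresh rows must be pulled in before
// MorphColumn can emit the next batch of output rows; rows[] is repopulated each pass
// with pointers into ringBuf, or with constBorderRow for virtual rows outside the
// image when the column border mode is CONSTANT. Slot addressing for absolute source
// row srcY (illustrative sketch only):
//
//   u8 * slot = ringBuf + ((srcY - startY0) % bufRows) * swidthcn;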
+ dcount = dcount > 0 ? dcount : bufRows - kheight + 1; + dcount = std::min(dcount, count); + count -= dcount; + for( ; dcount-- > 0; src += srcStride ) + { + ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows; + u8* brow = ringBuf + bi*swidthcn; + + if( (size_t)(++rowCount) > bufRows ) + { + --rowCount; + ++startY; + } + + memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn ); + + if( makeBorder ) + { + for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ ) + srcRow[i] = src[borderTab[i]]; + for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ ) + srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]]; + } + + MorphRow(srcRow, brow, width, cn, ksize.width); + } + + ptrdiff_t max_i = std::min(bufRows, ssize.height - dy + (kheight - 1)); + for( i = 0; i < max_i; i++ ) + { + ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay, + wholeSize.height, columnBorderType); + if( srcY < 0 ) // can happen only with constant border type + rows[i] = constBorderRow; + else + { + if( srcY >= startY + rowCount ) + break; + ptrdiff_t bi = (srcY - startY0) % bufRows; + rows[i] = ringBuf + bi*swidthcn; + } + } + if( i < kheight ) + break; + i -= kheight - 1; + MorphColumn((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height); + } +} + +} // namespace +#endif // CAROTENE_NEON + +void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/mul.cpp b/3rdparty/carotene/src/mul.cpp new file mode 100644 index 0000000000..3bbbfc50aa --- /dev/null +++ b/3rdparty/carotene/src/mul.cpp @@ -0,0 +1,1572 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include +#include +#include +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +bool isIntegerScale(f32 scale) +{ + return std::fabs(scale - static_cast<s32>(scale)) < FLT_EPSILON; +} + +template <int shift> +void mulu8(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
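// Reviewer note: roiw16/roiw8 are the exclusive upper bounds for the start index of a
// full 16- or 8-lane vector: width - 15 guarantees j + 16 <= width inside
// "for (; j < roiw16; j += 16)", and the ternary guard keeps the unsigned subtraction
// from wrapping around on narrow images. Same pattern, spelled out (illustrative
// sketch only):
//
//   size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;  // last valid 16-lane start + 1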
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vqmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val >> shift); + } + } + } +} + +template +void muls16(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
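// Reviewer note: in mulu8 above, the only difference between the two policy branches
// is the final narrowing step: vqmovn_u16 saturates, vmovn_u16 truncates (matching the
// scalar "(u8)(val >> shift)" tail). E.g. with shift == 1 and src0 = src1 = 200:
// (200 * 200) >> 1 == 20000, which saturates to 255 but wraps to 20000 & 0xFF == 32.
// muls16 below plays the same game with a vminq_u16 clamp against 0x7FFF instead of a
// narrowing.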
size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val >> shift); + } + } + } +} + +typedef void (* mulFuncu8)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +typedef void (* mulFuncs16)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +} // namespace + +#endif + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if ((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + u8 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(u8) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncu8 funcs[16] = + { + NULL, + mulu8<1>, + mulu8<2>, + mulu8<3>, + mulu8<4>, + mulu8<5>, + mulu8<6>, + mulu8<7>, + mulu8<8>, + mulu8<9>, + mulu8<10>, + mulu8<11>, + mulu8<12>, + mulu8<13>, + mulu8<14>, + mulu8<15> + }; + + 
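// Reviewer note: power-of-two dispatch. frexp(scale, &exp) decomposes
// scale = significand * 2^exp with significand in [0.5, 1), so significand == 0.5 with
// exp <= 0 means scale == 2^(exp - 1), i.e. 1/2, 1/4, ..., and multiply-plus-scale
// collapses to a right shift by (1 - exp). funcs[0] stays NULL because shift 0
// (scale == 1) is impossible here: exp <= 0 makes shift = 1 - exp at least 1, and
// scale == 1 is served by the integer-scale path below instead. Worked example
// (illustrative sketch only):
//
//   float scale = 0.25f;            // == 2^-2
//   int exp;
//   float sig = frexp(scale, &exp); // sig == 0.5f, exp == -1
//   int shift = -exp + 1;           // shift == 2, so dst = (src0 * src1) >> 2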
mulFuncu8 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vqmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst2f)), + vqmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0u), vqmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vqmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = 
vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst2f)), + vmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0u), vmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (u8)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) && (scale >= 0)) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncs16 funcs[16] = + { + NULL, + 
muls16<1>, + muls16<2>, + muls16<3>, + muls16<4>, + muls16<5>, + muls16<6>, + muls16<7>, + muls16<8>, + muls16<9>, + muls16<10>, + muls16<11>, + muls16<12>, + muls16<13>, + muls16<14>, + muls16<15> + }; + + mulFuncs16 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j +8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = 
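// Reviewer note on the saturating u8*u8 -> s16 stores above: the product is always
// non-negative, so full signed saturation is unnecessary. Clamping with vminq_u16
// against 0x7FFF (v_32767) and then reinterpreting the lanes as s16 is enough; e.g.
// 255 * 255 == 65025 would reinterpret to a negative s16, and the clamp pins it to
// 32767. Scalar equivalent (illustrative sketch only):
//
//   s16 out = (s16)std::min<u32>(product, 32767u);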
internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + bool is_integer_scale = isIntegerScale(scale); + s32 iscale = static_cast(scale); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * 
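// Reviewer note: mixed u8 * s16 inputs. The u8 operand is widened with vmovl_u8 and
// reinterpreted as s16 via vreinterpretq_s16_u16; that is safe because a widened u8
// lane is at most 255, well inside the s16 range, so vmull_s16 sees the correct value.
// Scalar equivalent (illustrative sketch only):
//
//   s16 a = (s16)src0[j];               // safe: 0..255 fits in s16
//   s32 val = (s32)a * (s32)src1[j];    // same as (s32)src0[j] * src1[j]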
(f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON + +template +inline T mulSaturateQ(const 
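// Reviewer note: mulSaturateQ/mulWrapQ below are defined recursively. The generic
// template splits a vector into halves, widens each half with vmovl, recurses, and
// narrows back (vqmovn saturating, vmovn wrapping); the recursion bottoms out at the
// int32x4_t/uint32x4_t specializations, which do the actual multiply-and-scale in f32.
// Call chain for a uint8x16_t input (illustrative sketch only):
//
//   mulSaturateQ(u8x16 a, u8x16 b, s)
//     -> vcombine(vqmovn(mulSaturateQ(vmovl(lo(a)), vmovl(lo(b)), s)),
//                 vqmovn(mulSaturateQ(vmovl(hi(a)), vmovl(hi(b)), s)))
//       -> ... -> vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(f32(a), f32(b)), s))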
T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t mulSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t mulSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); } + +template +inline T mulSaturate(const T &v1, const T &v2, const float scale) +{ + return internal::vqmovn(mulSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t mulSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t mulSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); } + + +template +inline T mulWrapQ(const T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t mulWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t mulWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); } + +template +inline T mulWrap(const T &v1, const T &v2, const float scale) +{ + return internal::vmovn(mulWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t mulWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t mulWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); } + + +template inline uint8x16_t vshrq_n(const uint8x16_t & v0) { return vshrq_n_u8 (v0, n); } +template inline int8x16_t vshrq_n(const int8x16_t & v0) { return vshrq_n_s8 (v0, n); } +template inline uint16x8_t vshrq_n(const uint16x8_t & v0) { return vshrq_n_u16(v0, n); } +template inline int16x8_t vshrq_n(const int16x8_t & v0) { return vshrq_n_s16(v0, n); } +template inline uint32x4_t vshrq_n(const uint32x4_t & v0) { return vshrq_n_u32(v0, n); } +template inline int32x4_t vshrq_n(const int32x4_t & v0) { return vshrq_n_s32(v0, n); } +template inline uint64x2_t vshrq_n(const uint64x2_t & v0) { return vshrq_n_u64(v0, n); } +template inline int64x2_t vshrq_n(const int64x2_t & v0) { return vshrq_n_s64(v0, n); } + +template inline uint8x8_t vshr_n(const uint8x8_t & v0) { return vshr_n_u8 (v0, n); } +template inline int8x8_t vshr_n(const int8x8_t & v0) { return vshr_n_s8 (v0, n); } +template inline uint16x4_t vshr_n(const uint16x4_t & v0) { 
return vshr_n_u16(v0, n); } +template inline int16x4_t vshr_n(const int16x4_t & v0) { return vshr_n_s16(v0, n); } +template inline uint32x2_t vshr_n(const uint32x2_t & v0) { return vshr_n_u32(v0, n); } +template inline int32x2_t vshr_n(const int32x2_t & v0) { return vshr_n_s32(v0, n); } +template inline uint64x1_t vshr_n(const uint64x1_t & v0) { return vshr_n_u64(v0, n); } +template inline int64x1_t vshr_n(const int64x1_t & v0) { return vshr_n_s64(v0, n); } + +template inline uint8x16_t vrshrq_n(const uint8x16_t & v0) { return vrshrq_n_u8 (v0, n); } +template inline int8x16_t vrshrq_n(const int8x16_t & v0) { return vrshrq_n_s8 (v0, n); } +template inline uint16x8_t vrshrq_n(const uint16x8_t & v0) { return vrshrq_n_u16(v0, n); } +template inline int16x8_t vrshrq_n(const int16x8_t & v0) { return vrshrq_n_s16(v0, n); } +template inline uint32x4_t vrshrq_n(const uint32x4_t & v0) { return vrshrq_n_u32(v0, n); } +template inline int32x4_t vrshrq_n(const int32x4_t & v0) { return vrshrq_n_s32(v0, n); } +template inline uint64x2_t vrshrq_n(const uint64x2_t & v0) { return vrshrq_n_u64(v0, n); } +template inline int64x2_t vrshrq_n(const int64x2_t & v0) { return vrshrq_n_s64(v0, n); } + +template inline uint8x8_t vrshr_n(const uint8x8_t & v0) { return vrshr_n_u8 (v0, n); } +template inline int8x8_t vrshr_n(const int8x8_t & v0) { return vrshr_n_s8 (v0, n); } +template inline uint16x4_t vrshr_n(const uint16x4_t & v0) { return vrshr_n_u16(v0, n); } +template inline int16x4_t vrshr_n(const int16x4_t & v0) { return vrshr_n_s16(v0, n); } +template inline uint32x2_t vrshr_n(const uint32x2_t & v0) { return vrshr_n_u32(v0, n); } +template inline int32x2_t vrshr_n(const int32x2_t & v0) { return vrshr_n_s32(v0, n); } +template inline uint64x1_t vrshr_n(const uint64x1_t & v0) { return vrshr_n_u64(v0, n); } +template inline int64x1_t vrshr_n(const int64x1_t & v0) { return vrshr_n_s64(v0, n); } + +template +void mulShift(const Size2D &size, + const T * src0Base, ptrdiff_t src0Stride, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + typedef typename internal::VecTraits::vec128 vec128; + typedef typename internal::VecTraits::vec128 wvec128; + typedef typename internal::VecTraits::vec64 vec64; + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? 
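// Reviewer note: mulShift handles scale == 2^-shift for the wider element types. The
// vshrq_n/vshr_n/vrshrq_n/vrshr_n overloads above are presumably template <int n>
// wrappers: the NEON shift intrinsics require a compile-time immediate count, and the
// wrapper family lets mulShift forward its shift parameter into them. Inside the loops
// the widened product is reduced with a rounding right shift; the vqsubq/vbicq bias
// against v_mask appears to nudge exact-half cases so the integer result matches the
// float reference. Plain rounding shift for comparison (illustrative sketch only):
//
//   u32 rshr(u32 x, int n) { return (x + (1u << (n - 1))) >> n; }  // what vrshrq_n computes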
size.width - step64 + 1 : 0; + + wvec128 v_mask = internal::vdupq_n((WT)(1 << shift)); + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0), internal::vget_low(v_src1)); + wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1)); + + vec64 v_res0 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) ))); + vec64 v_res1 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) ))); + + internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1)); + } + for (; j < roiw64; j += step64) + { + wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j)); + vec64 v_res = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) ))); + internal::vst1(dst + j, v_res); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = internal::saturate_cast<T>((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0), internal::vget_low(v_src1)); + wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1)); + + vec64 v_res0 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) ))); + vec64 v_res1 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) ))); + + internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1)); + } + for (; j < roiw64; j += step64) + { + wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j)); + vec64 v_res = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) ))); + internal::vst1(dst + j, v_res); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = (T)((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift); + } + } + } +} +#endif + +template <typename T, typename WT> +void mul(const Size2D &size, + const T * src0Base, ptrdiff_t src0Stride, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + typedef typename internal::VecTraits<T>::vec128 vec128; + + typedef void (* mulFunc)(const Size2D &size, + const T * src0Base, ptrdiff_t src0Stride, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + + if (scale == 0.0f || + (std::numeric_limits<T>::is_integer && + (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) < 1.0f && + (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + T * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(T) * size.width); + } + return; + } + + s32 iscale = static_cast<s32>(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0) && (exp > -15); /* exp > -15 keeps the funcs[] index below 16 */ + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFunc funcs[16] = + { + NULL, + mulShift<T, WT, 1>, + mulShift<T, WT, 2>, + mulShift<T, WT, 3>, + mulShift<T, WT, 4>, + mulShift<T, WT, 5>, + mulShift<T, WT, 6>, + mulShift<T, WT, 7>, + mulShift<T, WT, 8>, + mulShift<T, WT, 9>, + mulShift<T, WT, 10>, + mulShift<T, WT, 11>, + mulShift<T, WT, 12>, + mulShift<T, WT, 13>, + mulShift<T, WT, 14>, + mulShift<T, WT, 15> + }; + + mulFunc func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ?
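+ /* roiw128/roiw64 mark the last column where a whole 128-/64-bit vector load still fits inside the row; columns beyond them are handled by the scalar tails */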
size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = internal::saturate_cast<T>(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulSaturateQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulSaturate(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast<T>(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = (T)(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulWrapQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulWrap(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (T)((s32)trunc(fval)); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +} + +void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul<s8, s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const
u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul<u16, u32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul<s16, s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + typedef void (* mulFunc)(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + + if (!std::isnormal(scale) || + ((scale * std::numeric_limits<s32>::max() * std::numeric_limits<s32>::max()) < 1.0f && + (scale * std::numeric_limits<s32>::max() * std::numeric_limits<s32>::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + s32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s32) * size.width); + } + return; + } + + s32 iscale = static_cast<s32>(scale), exp = 0; + f64 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5) && (exp <= 0) && (exp > -15); /* exp > -15 keeps the funcs[] index below 16 */ + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFunc funcs[16] = + { + NULL, + mulShift<s32, s64, 1>, + mulShift<s32, s64, 2>, + mulShift<s32, s64, 3>, + mulShift<s32, s64, 4>, + mulShift<s32, s64, 5>, + mulShift<s32, s64, 6>, + mulShift<s32, s64, 7>, + mulShift<s32, s64, 8>, + mulShift<s32, s64, 9>, + mulShift<s32, s64, 10>, + mulShift<s32, s64, 11>, + mulShift<s32, s64, 12>, + mulShift<s32, s64, 13>, + mulShift<s32, s64, 14>, + mulShift<s32, s64, 15> + }; + + mulFunc func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ?
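+ /* for s32 only the iscale == 1 case is vectorized (products widened to s64 lanes); any other scale falls back to scalar f64 arithmetic to preserve precision */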
size.width - 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = internal::saturate_cast<s32>(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = (f64)src0[j] * src1[j] * scale; /* promote to f64 before multiplying: s32*s32 could overflow */ + dst[j] = internal::saturate_cast<s32>(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = (s32)(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = (f64)src0[j] * src1[j] * scale; /* promote to f64 before multiplying: s32*s32 could overflow */ + dst[j] = (s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ?
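+ /* scale == 1 gets a dedicated loop below so the common case skips the extra vmulq_n_f32 per vector */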
size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j))); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j))); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j]; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_n_f32(vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j)), scale)); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_n_f32(vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j)), scale)); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j] * scale; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/norm.cpp b/3rdparty/carotene/src/norm.cpp new file mode 100644 index 0000000000..6ff2456597 --- /dev/null +++ b/3rdparty/carotene/src/norm.cpp @@ -0,0 +1,1310 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +//magic number; must be multiple of 4 +#define NORM32F_BLOCK_SIZE 2048 + +s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vld1q_u8(src); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vld1q_u8(src + i); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src))); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src + i))); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vld1q_u16(src); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vld1q_u16(src + i); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return 
result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src))); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src + i))); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max(std::abs((s32)(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + uint32x4_t s = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src))); + for (i = 4; i <= size.width - 4; i += 4) + { + internal::prefetch(src + i); + uint32x4_t s1 = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vmaxq_u32(s1, s); + } + u32 s2[2]; + uint32x2_t s3 = vmax_u32(vget_low_u32(s), vget_high_u32(s)); + vst1_u32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + float32x4_t s = vabsq_f32(vld1q_f32(src)); + for (i = 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vmaxq_f32(sa, s); + } + f32 s2[2]; + float32x2_t s3 = vmax_f32(vget_low_f32(s), vget_high_f32(s)); + vst1_f32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max(s2[j], result); + } + for (; i < size.width; i++) + result = std::max(std::abs(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
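+ /* u8 L1 norm: absolute values are summed into u16 lanes in blocks of at most 256 pixels (32 vaddw_u8 steps of values <= 255 cannot overflow u16) before spilling into the u32 accumulator */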
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vld1_u8(src + i); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vld1_u16(src + i); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vreinterpret_u16_s16(vabs_s16(vld1_s16(src + i))); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vaddq_f32(s, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vabsq_f32(vld1q_f32(src + i)); + for (i += 4; i <= limit; i += 4) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vaddq_f32(sa, s); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for (; i < size.width; i++) + result += std::abs((f64)(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + uint32x4_t sl = vmovq_n_u32(0); + uint32x4_t sh = vmovq_n_u32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + uint16x8_t sq = vmull_u8(s1, s1); + + sl = vaddw_u16(sl, vget_low_u16(sq)); + sh = vaddw_u16(sh, vget_high_u16(sq)); + } + + uint32x4_t s = vaddq_u32(sl, sh); + uint32x2_t ss = vadd_u32(vget_low_u32(s), vget_high_u32(s)); + + u32 s2[2]; + vst1_u32(s2, ss); + + result += (s32)(s2[0] + s2[1]); + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + int32x4_t sl = vmovq_n_s32(0); + int32x4_t sh = vmovq_n_s32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + int8x8_t s1 = vld1_s8(src + i); + int16x8_t sq = vmull_s8(s1, s1); + + sl = vaddw_s16(sl, vget_low_s16(sq)); + sh = vaddw_s16(sh, vget_high_s16(sq)); + } + + int32x4_t s = vaddq_s32(sl, sh); + int32x2_t ss = vadd_s32(vget_low_s32(s), vget_high_s32(s)); + + s32 s2[2]; + vst1_s32(s2, ss); + + result += s2[0] + s2[1]; + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + uint16x4_t s0 = vld1_u16(src+i); + float32x4_t s = vcvtq_f32_u32(vmull_u16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + uint16x4_t s1 = vld1_u16(src+i); + float32x4_t sq = vcvtq_f32_u32(vmull_u16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
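+ /* 16/32-bit L2 norms: squares are accumulated as f32 in blocks of NORM32F_BLOCK_SIZE and flushed into the f64 result per block to bound single-precision rounding error */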
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + int16x4_t s0 = vld1_s16(src+i); + float32x4_t s = vcvtq_f32_s32(vmull_s16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + int16x4_t s1 = vld1_s16(src+i); + float32x4_t sq = vcvtq_f32_s32(vmull_s16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vld1q_f32(src + i); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + uint8x16_t vs3 = vdupq_n_u8(0); + for (; i < size.width - 16; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2)); + } + + u8 s2[8]; + vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3))); + + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + + for (; i < size.width; i++) + { + result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + float32x4_t s = vabdq_f32(vld1q_f32(src1), vld1q_f32(src2)); + + for (i += 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vmaxq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + if (s2[j] > result) + result = s2[j]; + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + if (v > result) + result = v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == 
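+ /* when both strides equal the row width the image is contiguous, so it is processed as one long row */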
src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i + 2*256) - 16; + uint16x8_t si1 = vmovq_n_u16(0); + uint16x8_t si2 = vmovq_n_u16(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + si1 = vabal_u8(si1, vget_low_u8(vs1), vget_low_u8(vs2)); + si2 = vabal_u8(si2, vget_high_u8(vs1), vget_high_u8(vs2)); + } + + u32 s2[4]; + vst1q_u32(s2, vaddq_u32(vpaddlq_u16(si1), vpaddlq_u16(si2))); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + s2[j]); + } + } + + } + + for (; i < size.width; i++) + { + u32 v = std::abs((s32)(src1[i]) - (s32)(src2[i])); + + if ((s32)(0x7fFFffFFu - v) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vaddq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + result += (f64)(v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + +#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i 
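+ /* NORML28U_BLOCK_SIZE bounds each block so the u32 partial sums of squared u8 differences cannot overflow before the vqaddq_u32 fold; the s32 result itself saturates at 0x7fFFffFF instead of wrapping */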
+ NORML28U_BLOCK_SIZE) - 16; + uint32x4_t si1 = vmovq_n_u32(0); + uint32x4_t si2 = vmovq_n_u32(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + uint16x8_t vdlo = vabdl_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdhi = vabdl_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + si1 = vmlal_u16(si1, vget_low_u16(vdlo), vget_low_u16(vdlo)); + si2 = vmlal_u16(si2, vget_high_u16(vdlo), vget_high_u16(vdlo)); + + si1 = vmlal_u16(si1, vget_low_u16(vdhi), vget_low_u16(vdhi)); + si2 = vmlal_u16(si2, vget_high_u16(vdhi), vget_high_u16(vdhi)); + } + + u32 s2[4]; + vst1q_u32(s2, vqaddq_u32(si1, si2)); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += (s32)s2[j]; + } + } + + } + + for (; i < size.width; i++) + { + s32 v = (s32)(src1[i]) - (s32)(src2[i]); + v *= v; + + if ((s32)(0x7fFFffFFu - (u32)(v)) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vsubq_f32(vs2,vs1); + s = vmlaq_f32(s, vd, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = src1[i] - src2[i]; + result += v * v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/opticalflow.cpp b/3rdparty/carotene/src/opticalflow.cpp new file mode 100644 index 0000000000..fa9402a05c --- /dev/null +++ b/3rdparty/carotene/src/opticalflow.cpp @@ -0,0 +1,539 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include <cmath> +#include <cfloat> // For FLT_EPSILON + +namespace CAROTENE_NS { + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +/* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ +void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f; + s32 cn2 = cn*2; + + std::vector<s16> _buf(winSize.total()*(cn + cn2)); + s16* IWinBuf = &_buf[0]; + s32 IWinBufStride = winSize.width*cn; + s16* derivIWinBuf = &_buf[winSize.total()*cn]; + s32 derivIWinBufStride = winSize.width*cn2; + + for( u32 ptidx = 0; ptidx < ptCount; ptidx++ ) + { + f32 levscale = (1./(1 << level)); + u32 ptref = ptidx << 1; + f32 prevPtX = prevPts[ptref+0]*levscale; + f32 prevPtY = prevPts[ptref+1]*levscale; + f32 nextPtX; + f32 nextPtY; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPtX = nextPts[ptref+0]*levscale; + nextPtY = nextPts[ptref+1]*levscale; + } + else + { + nextPtX = prevPtX; + nextPtY = prevPtY; + } + } + else + { + nextPtX = nextPts[ptref+0]*2.f; + nextPtY = nextPts[ptref+1]*2.f; + } + nextPts[ptref+0] = nextPtX; + nextPts[ptref+1] = nextPtY; + + s32 iprevPtX, iprevPtY; + s32 inextPtX, inextPtY; + prevPtX -= halfWinX; + prevPtY -= halfWinY; + iprevPtX = floor(prevPtX); + iprevPtY = floor(prevPtY); + + if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width || + iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] =
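+ /* a window that lies outside the image only marks the point as lost at level 0; at coarser pyramid levels the propagated estimate is kept for the next level */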
0; + } + continue; + } + + f32 a = prevPtX - iprevPtX; + f32 b = prevPtY - iprevPtY; + const s32 W_BITS = 14, W_BITS1 = 14; + const f32 FLT_SCALE = 1.f/(1 << 20); + s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + s32 iw01 = round(a*(1.f - b)*(1 << W_BITS)); + s32 iw10 = round((1.f - a)*b*(1 << W_BITS)); + s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + s32 dstep = prevDerivStride/sizeof(s16); + f32 A11 = 0, A12 = 0, A22 = 0; + + int16x4_t viw00 = vmov_n_s16((s16)iw00); + int16x4_t viw01 = vmov_n_s16((s16)iw01); + int16x4_t viw10 = vmov_n_s16((s16)iw10); + int16x4_t viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vA11 = vmovq_n_f32(0); + float32x4_t vA12 = vmovq_n_f32(0); + float32x4_t vA22 = vmovq_n_f32(0); + + s32 wwcn = winSize.width*cn; + + // extract the patch from the first image, compute covariation matrix of derivatives + s32 x = 0; + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn; + const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2; + + s16* Iptr = IWinBuf + y*IWinBufStride; + s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + internal::prefetch(src + x + prevStride * 2, 0); + for(x = 0; x <= wwcn - 8; x += 8) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh)); + } + for(; x <= wwcn - 4; x += 4) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00))); + int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10))); + int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01))); + int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11))); + + int32x4_t vsuml1 = vmull_s16(vs00, viw00); + int32x4_t vsuml2 = vmull_s16(vs01, viw01); + vsuml1 = vmlal_s16(vsuml1, vs10, viw10); + vsuml2 = vmlal_s16(vsuml2, vs11, viw11); + int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + + vst1_s16(Iptr + x, vsumnl); + } + + internal::prefetch(dsrc + dstep * 2, 0); + for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { +#if __GNUC_MINOR__ < 0 + __asm__ ( + "vld2.16 {d0-d1}, [%[dsrc00]] \n\t" + "vld2.16 {d2-d3}, [%[dsrc10]] \n\t" + "vld2.16 {d4-d5}, [%[dsrc01]] \n\t" + "vld2.16 {d6-d7}, [%[dsrc11]] \n\t" + "vmull.s16 q4, d3, %P[viw10] \n\t" + "vmull.s16 q5, d0, %P[viw00] \n\t" + "vmlal.s16 q4, d7, %P[viw11] 
\n\t" + "vmlal.s16 q5, d4, %P[viw01] \n\t" + "vmlal.s16 q4, d1, %P[viw00] \n\t" + "vmlal.s16 q5, d2, %P[viw10] \n\t" + "vmlal.s16 q4, d5, %P[viw01] \n\t" + "vmlal.s16 q5, d6, %P[viw11] \n\t" + "vrshrn.s32 d13, q4, %[W_BITS1] \n\t" + "vrshrn.s32 d12, q5, %[W_BITS1] \n\t" + "vmull.s16 q3, d13, d13 \n\t" + "vmull.s16 q4, d12, d12 \n\t" + "vmull.s16 q5, d13, d12 \n\t" + "vcvt.f32.s32 q3, q3 \n\t" + "vcvt.f32.s32 q4, q4 \n\t" + "vcvt.f32.s32 q5, q5 \n\t" + "vadd.f32 %q[vA22], q3 \n\t" + "vadd.f32 %q[vA11], q4 \n\t" + "vadd.f32 %q[vA12], q5 \n\t" + "vst2.16 {d12-d13}, [%[out]] \n\t" + : [vA22] "=w" (vA22), + [vA11] "=w" (vA11), + [vA12] "=w" (vA12) + : "0" (vA22), + "1" (vA11), + "2" (vA12), + [out] "r" (dIptr), + [dsrc00] "r" (dsrc), + [dsrc10] "r" (dsrc + dstep), + [dsrc01] "r" (dsrc + cn2), + [dsrc11] "r" (dsrc + dstep + cn2), + [viw00] "w" (viw00), + [viw10] "w" (viw10), + [viw01] "w" (viw01), + [viw11] "w" (viw11), + [W_BITS1] "I" (W_BITS1) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); +#else + int16x4x2_t vdsrc00 = vld2_s16(dsrc); + int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep); + int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2); + int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2); + + int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10); + int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00); + + vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11); + vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01); + + vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00); + vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10); + + vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01); + vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11); + + int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1); + int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1); + + int32x4_t va22i = vmull_s16(vsumny, vsumny); + int32x4_t va11i = vmull_s16(vsumnx, vsumnx); + int32x4_t va12i = vmull_s16(vsumnx, vsumny); + + float32x4_t va22f = vcvtq_f32_s32(va22i); + float32x4_t va11f = vcvtq_f32_s32(va11i); + float32x4_t va12f = vcvtq_f32_s32(va12i); + + vA22 = vaddq_f32(vA22, va22f); + vA11 = vaddq_f32(vA11, va11f); + vA12 = vaddq_f32(vA12, va12f); + + int16x4x2_t vsum; + vsum.val[0] = vsumnx; + vsum.val[1] = vsumny; + vst2_s16(dIptr, vsum); +#endif + } + + for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 ) + { + s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5); + s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (s16)ival; + dIptr[0] = (s16)ixval; + dIptr[1] = (s16)iyval; + + A11 += (f32)(ixval*ixval); + A12 += (f32)(ixval*iyval); + A22 += (f32)(iyval*iyval); + } + } + + f32 A11buf[2], A12buf[2], A22buf[2]; + vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11))); + vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12))); + vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22))); + A11 += A11buf[0] + A11buf[1]; + A12 += A12buf[0] + A12buf[1]; + A22 += A22buf[0] + A22buf[1]; + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + f32 D = A11*A22 - A12*A12; + f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winSize.width*winSize.height); + + if( err && getMinEigenVals ) + err[ptidx] = (f32)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = false; + continue; + 
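+ /* minEig is the smaller eigenvalue of the 2x2 gradient matrix [[A11, A12], [A12, A22]], normalized by the window area; near-singular systems or weakly textured windows are rejected here */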
} + + D = 1.f/D; + + nextPtX -= halfWinX; + nextPtY -= halfWinY; + f32 prevDeltaX = 0; + f32 prevDeltaY = 0; + + for(u32 j = 0; j < terminationCount; j++ ) + { + inextPtX = floor(nextPtX); + inextPtY = floor(nextPtY); + + if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width || + inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height ) + { + if( level == 0 && status ) + status[ptidx] = false; + break; + } + + a = nextPtX - inextPtX; + b = nextPtY - inextPtY; + iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = round(a*(1.f - b)*(1 << W_BITS)); + iw10 = round((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 b1 = 0, b2 = 0; + + viw00 = vmov_n_s16((s16)iw00); + viw01 = vmov_n_s16((s16)iw01); + viw10 = vmov_n_s16((s16)iw10); + viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vb1 = vmovq_n_f32(0); + float32x4_t vb2 = vmovq_n_f32(0); + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + const s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + x = 0; + + internal::prefetch(Jptr, nextStride * 2); + internal::prefetch(Iptr, IWinBufStride/2); + internal::prefetch(dIptr, derivIWinBufStride/2); + + for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 ) + { + uint8x8_t vj00 = vld1_u8(Jptr + x); + uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride); + uint8x8_t vj01 = vld1_u8(Jptr + x + cn); + uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn); + int16x8_t vI = vld1q_s16(Iptr + x); + int16x8x2_t vDerivI = vld2q_s16(dIptr); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI); + + int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0])); + int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1])); + int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0])); + int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1])); + + float32x4_t vb1f = vcvtq_f32_s32(vb1i); + float32x4_t vb2f = vcvtq_f32_s32(vb2i); + + vb1 = vaddq_f32(vb1, vb1f); + vb2 = vaddq_f32(vb2, vb2f); + } + + for( ; x < wwcn; x++, dIptr += 2 ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + b1 += (f32)(diff*dIptr[0]); + b2 += (f32)(diff*dIptr[1]); + } + } + + f32 bbuf[2]; + float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2))); + vst1_f32(bbuf, vb); + b1 += bbuf[0]; + b2 += bbuf[1]; + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + f32 deltaX = (f32)((A12*b2 - A22*b1) * D); + f32 deltaY = (f32)((A12*b1 - A11*b2) * D); + + nextPtX += 
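+ /* (deltaX, deltaY) solves G * delta = -b, the Lucas-Kanade update; iteration stops on terminationEpsilon or when consecutive steps nearly cancel, i.e. the solution oscillates */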
deltaX; + nextPtY += deltaY; + nextPts[ptref+0] = nextPtX + halfWinX; + nextPts[ptref+1] = nextPtY + halfWinY; + + if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon ) + break; + + if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 && + std::abs(deltaY + prevDeltaY) < 0.01 ) + { + nextPts[ptref+0] -= deltaX*0.5f; + nextPts[ptref+1] -= deltaY*0.5f; + break; + } + prevDeltaX = deltaX; + prevDeltaY = deltaY; + } + + if( status && status[ptidx] && err && level == 0 && !getMinEigenVals ) + { + f32 nextPointX = nextPts[ptref+0] - halfWinX; + f32 nextPointY = nextPts[ptref+1] - halfWinY; + + s32 inextPointX = floor(nextPointX); + s32 inextPointY = floor(nextPointY); + + if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width || + inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height ) + { + if( status ) + status[ptidx] = false; + continue; + } + + f32 aa = nextPointX - inextPointX; + f32 bb = nextPointY - inextPointY; + iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS)); + iw01 = round(aa*(1.f - bb)*(1 << W_BITS)); + iw10 = round((1.f - aa)*bb*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 errval = 0.f; + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + + for( x = 0; x < wwcn; x++ ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + errval += std::abs((f32)diff); + } + } + err[ptidx] = errval / (32*wwcn*winSize.height); + } + } +#else + (void)size; + (void)cn; + (void)prevData; + (void)prevStride; + (void)prevDerivData; + (void)prevDerivStride; + (void)nextData; + (void)nextStride; + (void)prevPts; + (void)nextPts; + (void)status; + (void)err; + (void)winSize; + (void)terminationCount; + (void)terminationEpsilon; + (void)level; + (void)maxLevel; + (void)useInitialFlow; + (void)getMinEigenVals; + (void)minEigThreshold; + (void)ptCount; +#endif +} + +}//CAROTENE_NS + diff --git a/3rdparty/carotene/src/phase.cpp b/3rdparty/carotene/src/phase.cpp new file mode 100644 index 0000000000..141b1e864a --- /dev/null +++ b/3rdparty/carotene/src/phase.cpp @@ -0,0 +1,274 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include <cmath> +#include <cfloat> + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +#define FASTATAN2CONST(scale) \ + f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \ + P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \ + P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \ + P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \ + A_90((f32)(90.f * scale)), \ + A_180((f32)(180.f * scale)), \ + A_360((f32)(360.f * scale)); \ + float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \ + _90(vdupq_n_f32(A_90)), \ + _180(vdupq_n_f32(A_180)), \ + _360(vdupq_n_f32(A_360)), \ + z(vdupq_n_f32(0.0f)), \ + p1(vdupq_n_f32(P1)), \ + p3(vdupq_n_f32(P3)), \ + p5(vdupq_n_f32(P5)), \ + p7(vdupq_n_f32(P7)); + +#define FASTATAN2SCALAR(y, x, a) \ + { \ + f32 ax = std::abs(x), ay = std::abs(y); \ + f32 c, c2; \ + if (ax >= ay) \ + { \ + c = ay / (ax + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + else \ + { \ + c = ax / (ay + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + if (x < 0) \ + a = A_180 - a; \ + if (y < 0) \ + a = A_360 - a; \ + } + +#define FASTATAN2VECTOR(v_y, v_x, a) \ + { \ + float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \ + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \ + float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \ + float32x4_t c2 = vmulq_f32(c, c); \ + a = vmulq_f32(c2, p7); \ + \ + a = vmulq_f32(vaddq_f32(a, p5), c2); \ + a = vmulq_f32(vaddq_f32(a, p3), c2); \ + a = vmulq_f32(vaddq_f32(a, p1), c); \ + \ + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \ + a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \ + a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \ + \ + } + +} // namespace + +#endif + +void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(256.0f / 360.0f) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + float32x4_t v_05 = vdupq_n_f32(0.5f); + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + + // 0 + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + // 1 + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0), + vmovn_u16(v_dst16s1))); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + j); + int16x8_t v_src1 = vld1q_s16(src1 + j); + + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1_u8(dst + j, vmovn_u16(v_dst)); + } + + for (; j < size.width; j++) + { + f32 x = src0[j], y = src1[j]; + f32 a; + FASTATAN2SCALAR(y, x, a) + dst[j] = (u8)(s32)floor(a + 0.5f); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(scale) + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4); + float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4); + + float32x4_t v_dst32f; + // 0 + FASTATAN2VECTOR(v_src10, v_src00, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + // 1 + FASTATAN2VECTOR(v_src11, v_src01, v_dst32f) + vst1q_f32(dst + j + 4, v_dst32f); + } + if(j + 4 <= size.width) + { + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + float32x4_t v_dst32f; + FASTATAN2VECTOR(v_src1, v_src0, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + j += 4; + } + + for (; j < size.width; j++) + { + f32 a; + FASTATAN2SCALAR(src1[j], src0[j], a) + dst[j] = a; + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/pyramid.cpp b/3rdparty/carotene/src/pyramid.cpp new file mode 100644 index 0000000000..546ccecd97 --- /dev/null +++ b/3rdparty/carotene/src/pyramid.cpp @@ -0,0 +1,1414 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +#include + +namespace CAROTENE_NS { + +bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode) +{ + if (!isSupportedConfiguration()) + return false; + // Need at least 8 pixels for vectorization. + // Need to make sure dst width is half the src width. + // Don't care about dst height. + if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ) + return false; + + // Current implementation only supports Reflect101 (ie: UNDEFINED mode) + if (border_mode != BORDER_MODE_UNDEFINED) + return false; + + return true; +} + +bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 12 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +#ifdef CAROTENE_NEON + +namespace { + +ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len) +{ + if (len == 1) + return 0; + else + { + while ((unsigned)p >= (unsigned)len) + { + if (p < 0) + p = -p; + else + p = (len - 1)*2 - p; + } + } + return p; +} + +} // namespace + +#endif + +void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border)); +#ifdef CAROTENE_NEON + // Single-core NEON code + const size_t dwidth = dstSize.width; + const size_t dheight = dstSize.height; + const size_t swidth = srcSize.width; + const size_t sheight = 
srcSize.height; + + ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth); + ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth); + ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth); + ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth); + + //1-line buffer + std::vector<u16> _buf((swidth + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + u8* dst = dstBase; + + for (size_t i = 0; i < dheight; ++i, dst += dstStride) + { + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight)); + + size_t x = 0; + for (; x <= swidth - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < swidth; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + lane[-1] = lane[idx_l1]; + lane[-2] = lane[idx_l2]; + + lane[swidth] = lane[idx_r1]; + lane[swidth+1] = lane[idx_r2]; + + //horizontal convolution + x = 0; + size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15. + for (; x < vw; x += 8) + { + internal::prefetch(lane + 2 * x); + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15 + uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16 + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17 + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18 + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19 + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); + } + + for (; x < dwidth; x++) + { + dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
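+ // Note on the NEON path above: the separable 5-tap binomial kernel + // {1, 4, 6, 4, 1} is applied vertically into the u16 'lane' buffer and then + // horizontally, for a total weight of 16*16 = 256; the final vshrn/">> 8" + // truncates rather than rounds, presumably the "RTZ" (round-toward-zero) of the name.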
+ (void)srcSize; + (void)srcBase; + (void)srcStride; + (void)dstSize; + (void)dstBase; + (void)dstStride; + (void)border; +#endif + (void)borderValue; +} + +void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw8 = dcolcn - 7; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector<u16> _buf(cn*(srcSize.width + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2*cn], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.16 {d0-d3}, [%[in0]] \n\t" + "vld2.16 {d4-d7}, [%[in4]] \n\t" + "vld2.16 {d12-d15}, [%[in1]] \n\t" + "vld2.16 {d16-d19}, [%[in3]] \n\t" + "vld2.16 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t" + "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t" + "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t" + "vmla.i16 q0, q6, %q[c4] /*q0 += (v1+v3) * 4*/ \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vst1.8 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); + uint16x8x2_t vLane1 
= vld2q_u16(lane + 2*x-1); + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); +#endif + } + break; + case 3: + { + uint16x4_t vx1 = vld1_u16(lane - 2*3); + uint16x4_t vx2 = vld1_u16(lane - 1*3); + uint16x4_t vx3 = vld1_u16(lane + 0*3); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL)); + for (; x < roiw8; x += 6) + { + internal::prefetch(lane + 2 * x + 12); + + uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6); + uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6); + uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6); + uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, vtbl1_u8(v8, map)); + } + } + break; + case 4: + { + uint16x4_t vx1 = vld1_u16(lane - 2*4); + uint16x4_t vx2 = vld1_u16(lane - 1*4); + uint16x4_t vx3 = vld1_u16(lane + 0*4); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x + 16); + + uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8); + uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8); + uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8); + uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, v8); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
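+ // Note on the NEON path above: unlike the RTZ variant this version rounds when + // dividing by 256 (vrshrn, and "+ (1 << 7)" in the scalar tail). For cn == 3 an + // 8-lane result holds only two whole pixels (6 bytes), so the vtbl1_u8 'map' + // compacts the lanes and x advances by 6 per iteration.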
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector<s32> _buf(cn*(srcSize.width + 4) + 32/sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[2*cn], 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v3 = vld1_s16(ln3 + x); + int16x4_t v4 = vld1_s16(ln4 + x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d4-d7}, [%[in4]] \n\t" + "vld2.32 {d12-d15}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vld2.32 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i32 q0, q2 \n\t" + "vadd.i32 q6, q8 \n\t" + "vmla.i32 q0, q4, %q[c6] \n\t" + "vmla.i32 q0, q6, %q[c4] \n\t" + "vrshrn.s32 d8, q0, #8 \n\t" + "vst1.16 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2); + int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1); + int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0); + int32x4x2_t vLane3 = vld2q_s32(lane + 
2*x+1); + int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2); + + int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]); + int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32); + vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32); + int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8); + + vst1_s16(dst + x, vRes); +#endif + } + break; + case 3: + { + int32x4_t v0 = vld1q_s32(lane - 2*3); + int32x4_t v1 = vld1q_s32(lane - 1*3); + int32x4_t v2 = vld1q_s32(lane + 0*3); + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + case 4: + { + int32x4_t v0 = vld1q_s32(lane - 2*4); + int32x4_t v1 = vld1q_s32(lane - 1*4); + int32x4_t v2 = vld1q_s32(lane + 0*4); + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
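+ // Note on the NEON path above: the s16 variant widens to 32-bit accumulators + // (vaddl_s16/vmlal_s16) because a 16-bit sample times the total kernel weight + // of 256 does not fit in 16 bits; vrshrn_n_s32(.., 8) then narrows back with + // the same rounding as the "+ (1 << 7)) >> 8" scalar tail.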
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector<f32> _buf(cn*(srcSize.width + 4) + 32/sizeof(f32)); + f32* lane = internal::alignPtr(&_buf[2*cn], 32); + +#if __GNUC_MINOR__ < 7 + register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16 +#else + register float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16 +#endif + + for (size_t i = 0; i < dstSize.height; ++i) + { + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x); + float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x); + float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x); + float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x); + float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x); + + float32x4_t v = vaddq_f32(v1, v3); + float32x4_t v04 = vaddq_f32(v0, v4); + + v = vmlaq_f32(v, v2, vc6d4f32); + v = vmlaq_f32(v, v04, vc1d4f32); + + vst1q_f32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d8-d11}, [%[in4]] \n\t" + "vld2.32 {d14-d17}, [%[in2],:256] \n\t" + "vld2.32 
{d10-d13}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vmul.f32 q7, %q[c6d64] \n\t" + "vadd.f32 q0, q4 @v04 \n\t" + "vadd.f32 q5, q8 @v13 \n\t" + "vmla.f32 q7, q0, %q[c1d64] \n\t" + "vmla.f32 q7, q5, %q[c4d64] \n\t" + "vst1.32 {d14-d15}, [%[out]] \n\t" + : + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32) + : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7 + ); +#else + float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2); + float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1); + float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0); + float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1); + float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2); + + float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]); + float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]); + float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32); + vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32); + vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32); + + vst1q_f32(dst + x, vRes); +#endif + } + break; + case 3: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3); + + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + case 4: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4); + + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + f32* ln = lane + h; + f32* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k]; + } + } +#else + // Remove 'unused parameter' warnings. 
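+ // Note on the NEON path above: for f32 the 1/256 normalization is folded into + // the constants - the vertical pass uses {1/4, 1, 3/2} (= {1, 4, 6, 4, 1}/4) and + // the horizontal pass {1/64, 4/64, 6/64} (= {1, 4, 6, 4, 1}/64) - so every + // output pixel ends up with unit total weight and no final shift is needed.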
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector<u16> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16))); + u16* lane0 = internal::alignPtr(&_buf[cn], 32); + u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v1 = vld1_u8(ln1+x); + + uint16x8_t vl0 = vaddl_u8(v0, v2); + uint16x8_t vl1 = vaddl_u8(v1, v2); + + vl0 = vmlal_u8(vl0, v1, vc6u8); + vl1 = vshlq_n_u16(vl1, 2); + + vst1q_u16(lane0 + x, vl0); + vst1q_u16(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x]; + lane1[x] = 4u * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const u16* lane = lane0; +pyrUp8uHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 7 ? 
dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x + 0), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x - 1); + uint16x8_t vLane1 = vld1q_u16(lane + x + 0); + uint16x8_t vLane2 = vld1q_u16(lane + x + 1); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vLane0, 6); + vRes.val[1] = vrshrn_n_u16(vLane2, 4); + + vst2_u8(dst + x*2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 23 ? dcolshn - 23 : 0; + for (; x < lim; x += 24) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.u16 q9, #6 \n\t" + "vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.16 {d1, d3, d5}, [%[in02]] \n\t" + "vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.16 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.16 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i16 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i16 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i16 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i16 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i16 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i16 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.u16 d19, q3, #4 \n\t" + "vrshrn.u16 d21, q4, #4 \n\t" + "vrshrn.u16 d23, q5, #4 \n\t" + "vrshrn.u16 d18, q0, #6 \n\t" + "vrshrn.u16 d20, q1, #6 \n\t" + "vrshrn.u16 d22, q2, #6 \n\t" + "vzip.8 d18, d19 \n\t" + "vzip.8 d20, d21 \n\t" + "vzip.8 d22, d23 \n\t" + "vst3.8 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.8 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2 * x), + [out2] "r" (dst + 2 * x + 24), + [in0] "r" (lane + x - 3), + [in02] "r" (lane + x + 9), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 12), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 15) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8_t vc6 = vmovq_n_u16(6); + uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3); + uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0); + uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3); + + uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]); + uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]); + uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6); + + uint8x8x2_t vSumShr3; + vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4); + 
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6); + uint8x8x2_t vSumShr4; + vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4); + vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6); + uint8x8x2_t vSumShr5; + vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4); + vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6); + + vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]); + vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]); + vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]); + + uint8x8x3_t vRes1; + vRes1.val[0] = vSumShr3.val[0]; + vRes1.val[1] = vSumShr4.val[0]; + vRes1.val[2] = vSumShr5.val[0]; + vst3_u8(dst + 2 * x, vRes1); + + uint8x8x3_t vRes2; + vRes2.val[0] = vSumShr3.val[1]; + vRes2.val[1] = vSumShr4.val[1]; + vRes2.val[2] = vSumShr5.val[1]; + vst3_u8(dst + 2 * x + 24, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 7 ? dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.32 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x-4), + [in1] "r" (lane + x), + [in2] "r" (lane + x+4), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x-4); + uint16x8_t vLane1 = vld1q_u16(lane + x+0); + uint16x8_t vLane2 = vld1q_u16(lane + x+4); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint32x2x2_t vRes; + vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4)); + vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6)); + + vst2_u32((uint32_t*)(dst + x*2), vRes); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const u16* ln = lane + h; + u8* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp8uHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. 
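+ // Note on the NEON path above: pyrUp emits two output columns per input + // column - even columns take (left + right + 6*center), odd columns + // 4*(center + right) - and with the factor of 8 already in the vertical pass + // both sums carry a total weight of 64, hence the rounding "+ (1 << 5)) >> 6".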
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector<s32> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32))); + s32* lane0 = internal::alignPtr(&_buf[cn], 32); + s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + + int32x4_t vl0 = vaddl_s16(v0, v2); + int32x4_t vl1 = vaddl_s16(v1, v2); + + vl0 = vmlal_s16(vl0, v1, vc6s16); + vl1 = vshlq_n_s32(vl1, 2); + + vst1q_s32(lane0 + x, vl0); + vst1q_s32(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x]; + lane1[x] = 4 * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const s32* lane = lane0; +pyrUp16sHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 3 ? 
dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst2.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 1); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 1); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst2_s16(dst + x * 2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 11 ? dcolshn - 11 : 0; + for (; x < lim; x += 12) + { + internal::prefetch(lane + x + 3); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.s32 q9, #6 \n\t" + "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.32 {d1, d3, d5}, [%[in2]] \n\t" + "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.32 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.32 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i32 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i32 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i32 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i32 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i32 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i32 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.s32 d19, q3, #4 \n\t" + "vrshrn.s32 d21, q4, #4 \n\t" + "vrshrn.s32 d23, q5, #4 \n\t" + "vrshrn.s32 d18, q0, #6 \n\t" + "vrshrn.s32 d20, q1, #6 \n\t" + "vrshrn.s32 d22, q2, #6 \n\t" + "vzip.16 d18, d19 \n\t" + "vzip.16 d20, d21 \n\t" + "vzip.16 d22, d23 \n\t" + "vst3.16 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.16 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2*x), + [out2] "r" (dst + 2*x + 12), + [in0] "r" (lane + x - 3), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 6), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 9) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + int32x4_t vc6 = vmovq_n_s32(6); + int32x4x3_t vLane0 = vld3q_s32(lane + x - 3); + int32x4x3_t vLane1 = vld3q_s32(lane + x); + int32x4x3_t vLane2 = vld3q_s32(lane + x + 3); + + int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); + int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]); + int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]); + int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); + int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]); + int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6); + + int16x4x2_t vSumShr1; + vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4); + vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6); + 
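+ // vSumShr*.val[0] holds the even-column results (>> 6) and val[1] the + // odd-column results (>> 4); the vzip_s16 calls below interleave them + // into output order for the two vst3_s16 stores.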
+ int16x4x2_t vSumShr2; + vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4); + vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6); + + int16x4x2_t vSumShr3; + vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4); + vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6); + + vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]); + vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]); + vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]); + + int16x4x3_t vRes1; + vRes1.val[0] = vSumShr1.val[0]; + vRes1.val[1] = vSumShr2.val[0]; + vRes1.val[2] = vSumShr3.val[0]; + vst3_s16((int16_t*)(dst + 2 * x), vRes1); + + int16x4x3_t vRes2; + vRes2.val[0] = vSumShr1.val[1]; + vRes2.val[1] = vSumShr2.val[1]; + vRes2.val[2] = vSumShr3.val[1]; + vst3_s16(dst + 2 * x + 12, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 3 ? dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst1.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 4), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 4), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 4); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 4); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1])); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const s32* ln = lane + h; + s16* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp16sHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/reduce.cpp b/3rdparty/carotene/src/reduce.cpp new file mode 100644 index 0000000000..8c11c39e80 --- /dev/null +++ b/3rdparty/carotene/src/reduce.cpp @@ -0,0 +1,460 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <cstring> + +namespace CAROTENE_NS { + +void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memset(dstBase, 0, size.width*sizeof(s32)); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + + int32x4_t sll = vmovq_n_s32(0); + int32x4_t slh = vmovq_n_s32(0); + int32x4_t shl = vmovq_n_s32(0); + int32x4_t shh = vmovq_n_s32(0); + + for (size_t h = 0; h < size.height; h += 256) + { + size_t lim = std::min(h + 256, size.height); + + uint16x8_t sl = vmovq_n_u16(0); + uint16x8_t sh = vmovq_n_u16(0); + + for (size_t k = h; k < lim; ++k, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v = vld1q_u8(src_address); + + sl = vaddw_u8(sl, vget_low_u8(v)); + sh = vaddw_u8(sh, vget_high_u8(v)); + } + + int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl))); + int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl))); + int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh))); + int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh))); + + sll = vqaddq_s32(sll, vsll); + slh = vqaddq_s32(slh, vslh); + shl = vqaddq_s32(shl, vshl); + shh = vqaddq_s32(shh, vshh); + } + + vst1q_s32(dstBase + i + 0, sll); + vst1q_s32(dstBase + i + 4, slh); + vst1q_s32(dstBase + i + 8, shl); + vst1q_s32(dstBase + i + 12, shh); + } + + for(size_t h = 0; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu) + dstBase[j] = 0x7fFFffFF; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <= 
size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vmaxq_u8(s1, v1); + s2 = vmaxq_u8(s2, v2); + s3 = vmaxq_u8(s3, v3); + s4 = vmaxq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vmaxq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <= size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vminq_u8(s1, v1); + s2 = vminq_u8(s2, v2); + s3 = vminq_u8(s3, v3); + s4 = vminq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vminq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = 
srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vaddq_f32(s1, v1); + s2 = vaddq_f32(s2, v2); + s3 = vaddq_f32(s3, v3); + s4 = vaddq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vaddq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + dstBase[j] += srcBase[j + srcstep * h]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vmaxq_f32(s1, v1); + s2 = vmaxq_f32(s2, v2); + s3 = vmaxq_f32(s3, v3); + s4 = vmaxq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vmaxq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + 
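+    // Column-wise minimum of an f32 image: dst[j] = min over all rows of src(row, j).
+    // As in reduceColMax above, row 0 seeds the accumulators (memcpy), the NEON loops
+    // fold the remaining rows in 16 (then 4) floats at a time, and any leftover
+    // columns are finished with scalar std::min.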
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vminq_f32(s1, v1); + s2 = vminq_f32(s2, v2); + s3 = vminq_f32(s3, v3); + s4 = vminq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vminq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.cpp b/3rdparty/carotene/src/remap.cpp new file mode 100644 index 0000000000..a4b99c3db0 --- /dev/null +++ b/3rdparty/carotene/src/remap.cpp @@ -0,0 +1,694 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace internal { + +void remapNearestNeighborReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + dst_row[x] = srcBase[map_row[x]]; + } + } +} + +void remapNearestNeighborConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + s32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +void remapLinearReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 
14]], v_src10, 3); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s32 src00_index = map_row[(x << 2)]; + s32 src10_index = map_row[(x << 2) + 2]; + f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] + + srcBase[src00_index]; + f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] + + srcBase[src10_index]; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +void remapLinearConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? 
srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? 
srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue; + + f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +} // namespace internal + +#endif // CAROTENE_NEON + +bool isRemapNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isRemapLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + int32x2_t v_step2 = vdup_n_s32(srcStride); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + 
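+        // With REPLICATE borders, each tile of the f32 coordinate table is turned
+        // into absolute source offsets (y * srcStride + x), clamping x and y to the
+        // last column/row; remapNearestNeighborReplicate then performs the gather.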
int32x2_t v_zero2 = vdup_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0]))); + v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1]))); + v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0]))); + int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1]))); + int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = std::max(0, std::min(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0]))); + s32 src_y = std::max(0, std::min(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1]))); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + int32x2_t v_m1_2 = vdup_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + float32x2_t v_zero2 = vdup_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + 
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vcvtq_s32_f32(v_table1.val[0]); + v_dst_y = vcvtq_s32_f32(v_table1.val[1]); + v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]); + int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]); + uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)), + vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2))); + int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = (s32)floorf(table_row[(x << 1) + 0]); + s32 src_y = (s32)floorf(table_row[(x << 1) + 1]); + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + 
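+                        // The bilinear weights are the fractional parts of the source
+                        // coordinates; the (clamped) integer parts select the 2x2
+                        // neighborhood that remapLinearReplicate interpolates.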
coeff_row[x << 1] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; 
x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1)] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.hpp b/3rdparty/carotene/src/remap.hpp new file mode 100644 index 0000000000..0f9765965f --- /dev/null +++ b/3rdparty/carotene/src/remap.hpp @@ -0,0 +1,85 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SRC_REMAP_HPP
+#define CAROTENE_SRC_REMAP_HPP
+
+#include "common.hpp"
+
+#include <cmath>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS { namespace internal {
+
+enum
+{
+    BLOCK_SIZE = 32
+};
+
+
+void remapNearestNeighborReplicate(const Size2D size,
+                                   const u8 * srcBase,
+                                   const s32 * map,
+                                   u8 * dstBase, ptrdiff_t dstStride);
+
+void remapNearestNeighborConst(const Size2D size,
+                               const u8 * srcBase,
+                               const s32 * map,
+                               u8 * dstBase, ptrdiff_t dstStride,
+                               u8 borderValue);
+
+void remapLinearReplicate(const Size2D size,
+                          const u8 * srcBase,
+                          const s32 * map,
+                          const f32 * coeffs,
+                          u8 * dstBase, ptrdiff_t dstStride);
+
+void remapLinearConst(const Size2D size,
+                      const u8 * srcBase,
+                      const s32 * map,
+                      const f32 * coeffs,
+                      u8 * dstBase, ptrdiff_t dstStride,
+                      u8 borderValue);
+
+} }
+
+#endif // CAROTENE_NEON
+
+#endif // CAROTENE_SRC_REMAP_HPP
diff --git a/3rdparty/carotene/src/resize.cpp b/3rdparty/carotene/src/resize.cpp
new file mode 100644
index 0000000000..122a5f2201
--- /dev/null
+++ b/3rdparty/carotene/src/resize.cpp
@@ -0,0 +1,2191 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
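+
+// resize.cpp: NEON-accelerated nearest-neighbor, area, and linear resizing for
+// 8-bit images, plus the isResize*Supported() capability checks used by the HAL.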
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+namespace CAROTENE_NS {
+
+bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize)
+{
+#if SIZE_MAX <= UINT32_MAX
+    (void)ssize;
+#endif
+    bool supportedElemSize = (elemSize == 1) || (elemSize == 3) || (elemSize == 4);
+    return isSupportedConfiguration()
+#if SIZE_MAX > UINT32_MAX
+        && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internally used resizeGeneric performs
+                                                                    // index evaluation with u32
+#endif
+        && supportedElemSize;
+}
+
+bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels)
+{
+    bool supportedRatio = false;
+
+    if (channels == 1)
+        supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+    else if (channels == 3)
+        supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+    else if (channels == 4)
+        supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+
+    return isSupportedConfiguration() && supportedRatio;
+}
+
+bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize,
+                             f32 wr, f32 hr, u32 channels)
+{
+    if ((wr <= 2.0f) && (hr <= 2.0f))
+    {
+        bool channelsSupport = (channels == 1) || (channels == 3) || (channels == 4);
+        return (ssize.width >= 16) && (dsize.height >= 8) &&
+               (dsize.width >= 8) && channelsSupport;
+    }
+
+    return false;
+}
+
+bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels)
+{
+    switch(channels)
+    {
+    case 1:
+        if (ssize.width >= 8
+#if SIZE_MAX > UINT32_MAX
+            && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+                                                                        // is performed with u32
+#endif
+            && dsize.width >= 8 && dsize.height >= 8)
+            return isSupportedConfiguration();
+        return false;
+    case 4:
+        if (ssize.width >= 2
+#if SIZE_MAX > UINT32_MAX
+            && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+                                                                        // is performed with u32
+#endif
+            && dsize.width >= 2 && dsize.height >= 8)
+            return isSupportedConfiguration();
+    default:
+        return false;
+    };
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+u32 * calcLUT(size_t size, f32 ratio,
+              std::vector<u32> & _ofs)
+{
+    _ofs.resize(size);
+    u32 * ofs = &_ofs[0];
+
+    size_t roiw8 = size >= 7 ? size - 7 : 0;
+    size_t roiw4 = size >= 3 ? size - 3 : 0;
+    size_t x = 0;
+
+    f32 indices[4] = { 0, 1, 2, 3 };
+    float32x4_t v_index = vld1q_f32(indices), v_inc = vdupq_n_f32(4);
+    float32x4_t v_05 = vdupq_n_f32(0.5f), v_ratio = vdupq_n_f32(ratio);
+
+    for ( ; x < roiw8; x += 8)
+    {
+        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+        v_index = vaddq_f32(v_index, v_inc);
+
+        v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+        vst1q_u32(ofs + x + 4, vcvtq_u32_f32(v_dstf));
+        v_index = vaddq_f32(v_index, v_inc);
+    }
+
+    for ( ; x < roiw4; x += 4)
+    {
+        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+        v_index = vaddq_f32(v_index, v_inc);
+    }
+
+    for ( ; x < size; ++x)
+    {
+        ofs[x] = static_cast<u32>(floorf((x + 0.5f) * ratio));
+    }
+
+    return ofs;
+}
+
+template <typename T>
+void resizeGeneric(const Size2D &dsize,
+                   const void * srcBase, ptrdiff_t srcStride,
+                   void * dstBase, ptrdiff_t dstStride,
+                   f32 wr, f32 hr)
+{
+    std::vector<u32> _x_ofs;
+    u32 * x_ofs = calcLUT(dsize.width, wr, _x_ofs); // 32-bit LUT is used, so we could get issues on src image dimensions greater than (2^32-1)
+
+    for (size_t dst_y = 0; dst_y < dsize.height; ++dst_y)
+    {
+        size_t src_y = static_cast<size_t>(floorf((dst_y + 0.5f) * hr));
+        const T * src = internal::getRowPtr(static_cast<const T *>(srcBase), srcStride, src_y);
+        T * dst = internal::getRowPtr(static_cast<T *>(dstBase), dstStride, dst_y);
+
+        for (size_t dst_x = 0; dst_x < dsize.width; ++dst_x)
+        {
+            internal::prefetch(src + dst_x);
+            dst[dst_x] = src[x_ofs[dst_x]];
+        }
+    }
+}
+
+typedef struct _24bit_
+{
+    u8 a[3];
+} _24bit;
+
+} // namespace
+
+
+#endif
+
+void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
+                           const void * srcBase, ptrdiff_t srcStride,
+                           void * dstBase, ptrdiff_t dstStride,
+                           f32 wr, f32 hr, u32 elemSize)
+{
+    internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
+                                           (dsize.width - 0.5) * wr < ssize.width &&
+                                           (dsize.height - 0.5) * hr < ssize.height &&  // Ensure we have enough source data
+                                           (dsize.width + 0.5) * wr >= ssize.width &&
+                                           (dsize.height + 0.5) * hr >= ssize.height && // Ensure source isn't too big
+                                           isResizeNearestNeighborSupported(ssize, elemSize));
+#ifdef CAROTENE_NEON
+
+    if (elemSize == 1)
+    {
+        resizeGeneric<u8>(dsize,
+                          srcBase, srcStride,
+                          dstBase, dstStride,
+                          wr, hr);
+    }
+    else if (elemSize == 3)
+    {
+        resizeGeneric<_24bit>(dsize,
+                              srcBase, srcStride,
+                              dstBase, dstStride,
+                              wr, hr);
+    }
+    else if (elemSize == 4)
+    {
+        resizeGeneric<u32>(dsize,
+                           srcBase, srcStride,
+                           dstBase, dstStride,
+                           wr, hr);
+    }
+
+#else
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)wr;
+    (void)hr;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+template <bool opencv_like, int shiftsize>
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+    return vshrn_n_u16(data, shiftsize);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision<true, 2>(uint16x8_t data)
+{
+    // rounding
+    return vrshrn_n_u16(data,2);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision<true, 4>(uint16x8_t data)
+{
+    // bankers rounding
+    return vrshrn_n_u16(vqsubq_u16(data, vshrq_n_u16(vbicq_u16(vdupq_n_u16(1<<4), data), 4)),4);
+}
+
+template <bool opencv_like, int shiftsize>
+inline u8 areaDownsamplingDivision(u16 data)
+{
+    return data >> shiftsize;
+}
+template <>
+inline u8 areaDownsamplingDivision<true, 2>(u16 data)
+{
+    // rounding
+    return (data + 2) >> 2;
+}
+template <>
+inline u8 areaDownsamplingDivision<true, 4>(u16 data)
+{
+    // bankers rounding
+    return (data - (((1<<4) & ~data) >> 4) + 8) >> 4;
+}
+#endif
+
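+// resizeAreaRounding handles the ratios allowed by isResizeAreaSupported(): 2x and
+// 4x area reduction (averaging 2x2 / 4x4 pixel blocks) and the 0.5x upscale (pixel
+// duplication). The opencv_like flag selects the division used when averaging:
+// plain truncating shifts, or the rounding specializations above.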
+template <bool opencv_like>
+inline void resizeAreaRounding(const Size2D &ssize, const Size2D &dsize,
+                               const u8 * srcBase, ptrdiff_t srcStride,
+                               u8 * dstBase, ptrdiff_t dstStride,
+                               f32 wr, f32 hr, u32 channels)
+{
+    internal::assertSupportedConfiguration(isResizeAreaSupported(wr, hr, channels) &&
+                                           std::abs(dsize.width * wr - ssize.width) < 0.1 &&
+                                           std::abs(dsize.height * hr - ssize.height) < 0.1);
+#ifdef CAROTENE_NEON
+    if (channels == 1)
+    {
+        if ((wr == 2.0f) && (hr == 2.0f))
+        {
+            size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
+
+            for (size_t i = 0; i < dsize.height; ++i)
+            {
+                const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
+                const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
+                u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
+                size_t sj = 0, dj = 0;
+
+                for ( ; dj < roiw8; dj += 8, sj += 16)
+                {
+                    internal::prefetch(src0_row + sj);
+                    internal::prefetch(src1_row + sj);
+
+                    uint16x8_t vSum1 = vpaddlq_u8(vld1q_u8(src0_row + sj));
+                    uint16x8_t vSum2 = vpaddlq_u8(vld1q_u8(src1_row + sj));
+                    uint8x8_t vRes1 = areaDownsamplingDivision<opencv_like, 2>(vaddq_u16(vSum1, vSum2));
+
+                    vst1_u8(dst_row + dj, vRes1);
+                }
+
+                for ( ; dj < dsize.width; ++dj, sj += 2)
+                {
+                    dst_row[dj] = areaDownsamplingDivision<opencv_like, 2>(
+                                      (u16)src0_row[sj] + src0_row[sj + 1] +
+                                      src1_row[sj] + src1_row[sj + 1]);
+                }
+            }
+        }
+        else if ((wr == 0.5f) && (hr == 0.5f))
+        {
+            size_t roiw32 = dsize.width >= 31 ? dsize.width - 31 : 0;
+            size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
+
+            for (size_t i = 0; i < dsize.height; i += 2)
+            {
+                const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
+                u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
+                u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
+                size_t sj = 0, dj = 0;
+
+                for ( ; dj < roiw32; dj += 32, sj += 16)
+                {
+                    internal::prefetch(src_row + sj);
+
+                    uint8x16x2_t v_dst;
+                    v_dst.val[0] = v_dst.val[1] = vld1q_u8(src_row + sj);
+
+                    vst2q_u8(dst0_row + dj, v_dst);
+                    vst2q_u8(dst1_row + dj, v_dst);
+                }
+
+                for ( ; dj < roiw16; dj += 16, sj += 8)
+                {
+                    uint8x8x2_t v_dst;
+                    v_dst.val[0] = v_dst.val[1] = vld1_u8(src_row + sj);
+
+                    vst2_u8(dst0_row + dj, v_dst);
+                    vst2_u8(dst1_row + dj, v_dst);
+                }
+
+                for ( ; dj < dsize.width; dj += 2, ++sj)
+                {
+                    u8 src_val = src_row[sj];
+                    dst0_row[dj] = dst0_row[dj + 1] = src_val;
+                    dst1_row[dj] = dst1_row[dj + 1] = src_val;
+                }
+            }
+        }
+        else //if ((wr == 4.0f) && (hr == 4.0f)) //the only scale that lasts after isSupported check
+        {
+#ifndef ANDROID
+            size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
+#endif
+            size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
+
+            for (size_t i = 0; i < dsize.height; ++i)
+            {
+                const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
+                const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
+                const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
+                const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
+                u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
+                size_t sj = 0, dj = 0;
+
+#ifndef ANDROID
+                for ( ; dj < roiw16; dj += 16, sj += 64)
+                {
+                    internal::prefetch(src0_row + sj);
+                    internal::prefetch(src1_row + sj);
+                    internal::prefetch(src2_row + sj);
+                    internal::prefetch(src3_row + sj);
+
+                    uint8x16x4_t vLane1 = vld4q_u8(src0_row + sj);
+                    uint8x16x4_t vLane2 = vld4q_u8(src1_row + sj);
+                    uint8x16x4_t vLane3 = vld4q_u8(src2_row + sj);
+                    uint8x16x4_t vLane4 = vld4q_u8(src3_row + sj);
+
+                    uint16x8_t vSum_0 = vaddl_u8(vget_low_u8(vLane1.val[0]), vget_low_u8(vLane1.val[1]));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane1.val[2]), vget_low_u8(vLane1.val[3])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[0]), vget_low_u8(vLane2.val[1])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[2]), vget_low_u8(vLane2.val[3])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[0]), vget_low_u8(vLane3.val[1])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[2]), vget_low_u8(vLane3.val[3])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[0]), vget_low_u8(vLane4.val[1])));
+                    vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[2]), vget_low_u8(vLane4.val[3])));
+
+                    uint16x8_t vSum_1 = vaddl_u8(vget_high_u8(vLane1.val[0]), vget_high_u8(vLane1.val[1]));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane1.val[2]), vget_high_u8(vLane1.val[3])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[0]), vget_high_u8(vLane2.val[1])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[2]), vget_high_u8(vLane2.val[3])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[0]), vget_high_u8(vLane3.val[1])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[2]), vget_high_u8(vLane3.val[3])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[0]), vget_high_u8(vLane4.val[1])));
+                    vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[2]), vget_high_u8(vLane4.val[3])));
+
+                    uint8x8_t vRes_0 = areaDownsamplingDivision<opencv_like, 4>(vSum_0);
+                    uint8x8_t vRes_1 = areaDownsamplingDivision<opencv_like, 4>(vSum_1);
+
+                    vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
+                }
+#endif
+
+                for ( ; dj < roiw8; dj += 8, sj += 32)
+                {
+                    internal::prefetch(src0_row + sj);
+                    internal::prefetch(src1_row + sj);
+                    internal::prefetch(src2_row + sj);
+                    internal::prefetch(src3_row + sj);
+
+                    uint8x8x4_t vLane1 = vld4_u8(src0_row + sj);
+                    uint8x8x4_t vLane2 = vld4_u8(src1_row + sj);
+                    uint8x8x4_t vLane3 = vld4_u8(src2_row + sj);
+                    uint8x8x4_t vLane4 = vld4_u8(src3_row + sj);
+
+                    uint16x8_t vSum = vaddl_u8(vLane1.val[0], vLane1.val[1]);
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane1.val[2], vLane1.val[3]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[0], vLane2.val[1]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[2], vLane2.val[3]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[0], vLane3.val[1]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[2], vLane3.val[3]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[0], vLane4.val[1]));
+                    vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[2], vLane4.val[3]));
+
+                    vst1_u8(dst_row + dj, areaDownsamplingDivision<opencv_like, 4>(vSum));
+                }
+
+                for ( ; dj < dsize.width; ++dj, sj += 4)
+                {
+                    dst_row[dj] = areaDownsamplingDivision<opencv_like, 4>(
+                                      (u16)src0_row[sj] + src0_row[sj + 1] + src0_row[sj + 2] + src0_row[sj + 3] +
+                                      src1_row[sj] + src1_row[sj + 1] + src1_row[sj + 2] + src1_row[sj + 3] +
+                                      src2_row[sj] + src2_row[sj + 1] + src2_row[sj + 2] + src2_row[sj + 3] +
+                                      src3_row[sj] + src3_row[sj + 1] + src3_row[sj + 2] + src3_row[sj + 3]);
+                }
+            }
+        }
+    }
+    else if (channels == 4)
+    {
+        if ((wr == 2.0f) && (hr == 2.0f))
+        {
+#ifndef ANDROID
+            size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0;
+#endif
+            size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0;
+
+            for (size_t i = 0; i < dsize.height; ++i)
+            {
+                const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
+                const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
+                u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
+                size_t sj = 0, dj = 0;
+
+#ifndef ANDROID
+                for ( ; dj < roiw4; dj += 16, sj += 32)
+                {
+                    internal::prefetch(src0_row + sj);
+                    internal::prefetch(src1_row + sj);
+
+                    uint8x8_t vRes_0, vRes_1;
+
+                    {
+                        uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
+                        uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
+
+                        uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
+                        uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
+
+                        uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
+                        uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
+
+                        vRes_0 = areaDownsamplingDivision<opencv_like, 2>(vcombine_u16(vSum_l, vSum_h));
+                    }
+
+                    {
+                        uint8x16_t vLane1 = vld1q_u8(src0_row + sj + 16);
+                        uint8x16_t vLane2 = vld1q_u8(src1_row + sj + 16);
+
+                        uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
+                        uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
+
+                        uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
+                        uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
+
+                        vRes_1 = areaDownsamplingDivision<opencv_like, 2>(vcombine_u16(vSum_l, vSum_h));
+                    }
+
+                    vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
+                }
+#endif
+
+                for ( ; dj < roiw2; dj += 8, sj += 16)
+                {
+                    internal::prefetch(src0_row + sj);
+                    internal::prefetch(src1_row + sj);
+
+                    uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
+                    uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
+
+                    uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
+                    uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
+
+                    uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
+                    uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
+
+                    uint8x8_t vRes = areaDownsamplingDivision<opencv_like, 2>(vcombine_u16(vSum_l, vSum_h));
+                    vst1_u8(dst_row + dj, vRes);
+                }
+
+                for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 8)
+                {
+                    dst_row[dj    ] = areaDownsamplingDivision<opencv_like, 2>(
+                                          (u16)src0_row[sj    ] + src0_row[sj + 4] +
+                                          src1_row[sj    ] + src1_row[sj + 4]);
+                    dst_row[dj + 1] = areaDownsamplingDivision<opencv_like, 2>(
+                                          (u16)src0_row[sj + 1] + src0_row[sj + 5] +
+                                          src1_row[sj + 1] + src1_row[sj + 5]);
+                    dst_row[dj + 2] = areaDownsamplingDivision<opencv_like, 2>(
+                                          (u16)src0_row[sj + 2] + src0_row[sj + 6] +
+                                          src1_row[sj + 2] + src1_row[sj + 6]);
+                    dst_row[dj + 3] = areaDownsamplingDivision<opencv_like, 2>(
+                                          (u16)src0_row[sj + 3] + src0_row[sj + 7] +
+                                          src1_row[sj + 3] + src1_row[sj + 7]);
+                }
+            }
+        }
+        else if ((wr == 0.5f) && (hr == 0.5f))
+        {
+#ifndef ANDROID
+            size_t roiw32 = dsize.width >= 31 ?
(dsize.width - 31) << 2 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) << 2 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 128, sj += 64) + { + internal::prefetch(src_row + sj); + + uint8x16x4_t v_src = vld4q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + uint8x16x2_t v_c3 = vzipq_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + v_dst.val[3] = v_c3.val[0]; + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + v_dst.val[3] = v_c3.val[1]; + vst4q_u8(dst0_row + dj + 64, v_dst); + vst4q_u8(dst1_row + dj + 64, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 64, sj += 32) + { + internal::prefetch(src_row + sj); + + uint8x8x4_t v_src = vld4_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + uint8x8x2_t v_c3 = vzip_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + v_dst.val[3] = vcombine_u8(v_c3.val[0], v_c3.val[1]); + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 8, sj += 4) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 4] = src_val; + dst1_row[dj] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 5] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 6] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 6] = src_val; + + src_val = src_row[sj + 3]; + dst0_row[dj + 3] = dst0_row[dj + 7] = src_val; + dst1_row[dj + 3] = dst1_row[dj + 7] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { + size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0; + size_t roiw2 = dsize.width >= 1 ? 
(dsize.width - 1) << 2 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw4; dj += 16, sj += 64) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + uint16x8_t v_part_0, v_part_1; + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_0 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vLane10 = vld1q_u8(src0_row + sj + 32); + vLane11 = vld1q_u8(src0_row + sj + 48); + vLane20 = vld1q_u8(src1_row + sj + 32); + vLane21 = vld1q_u8(src1_row + sj + 48); + vLane30 = vld1q_u8(src2_row + sj + 32); + vLane31 = vld1q_u8(src2_row + sj + 48); + vLane40 = vld1q_u8(src3_row + sj + 32); + vLane41 = vld1q_u8(src3_row + sj + 48); + + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_1 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vst1q_u8(dst_row + dj, vcombine_u8(areaDownsamplingDivision(v_part_0), + areaDownsamplingDivision(v_part_1))); + } + + for ( ; dj < roiw2; dj += 8, sj += 32) + { + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + 
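// Reduction sketch (annotation, not upstream text): each uint8x16_t holds
+                    // four RGBA pixels, so vaddl_u8(low, high) below adds pixel i to pixel
+                    // i+2 per channel; folding v_sum's low/high halves then gives, per
+                    // channel c, sum[c] = (p0 + p1 + p2 + p3)[c] accumulated over the four
+                    // source rows -- the full 4x4 area sum that areaDownsamplingDivision()
+                    // turns into a rounded average.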
+                    uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
+                    v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
+                    v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
+                    v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
+
+                    uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
+                    v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
+                    v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
+                    v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
+
+                    uint16x8_t v_sum = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
+                                                    vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
+
+                    vst1_u8(dst_row + dj, areaDownsamplingDivision(v_sum));
+                }
+
+                for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 16)
+                {
+                    dst_row[dj    ] = areaDownsamplingDivision(
+                                          (u16)src0_row[sj    ] + src0_row[sj +  4] +
+                                               src0_row[sj + 8] + src0_row[sj + 12] +
+                                               src1_row[sj    ] + src1_row[sj +  4] +
+                                               src1_row[sj + 8] + src1_row[sj + 12] +
+                                               src2_row[sj    ] + src2_row[sj +  4] +
+                                               src2_row[sj + 8] + src2_row[sj + 12] +
+                                               src3_row[sj    ] + src3_row[sj +  4] +
+                                               src3_row[sj + 8] + src3_row[sj + 12]);
+
+                    dst_row[dj + 1] = areaDownsamplingDivision(
+                                          (u16)src0_row[sj + 1] + src0_row[sj +  5] +
+                                               src0_row[sj + 9] + src0_row[sj + 13] +
+                                               src1_row[sj + 1] + src1_row[sj +  5] +
+                                               src1_row[sj + 9] + src1_row[sj + 13] +
+                                               src2_row[sj + 1] + src2_row[sj +  5] +
+                                               src2_row[sj + 9] + src2_row[sj + 13] +
+                                               src3_row[sj + 1] + src3_row[sj +  5] +
+                                               src3_row[sj + 9] + src3_row[sj + 13]);
+
+                    dst_row[dj + 2] = areaDownsamplingDivision(
+                                          (u16)src0_row[sj +  2] + src0_row[sj +  6] +
+                                               src0_row[sj + 10] + src0_row[sj + 14] +
+                                               src1_row[sj +  2] + src1_row[sj +  6] +
+                                               src1_row[sj + 10] + src1_row[sj + 14] +
+                                               src2_row[sj +  2] + src2_row[sj +  6] +
+                                               src2_row[sj + 10] + src2_row[sj + 14] +
+                                               src3_row[sj +  2] + src3_row[sj +  6] +
+                                               src3_row[sj + 10] + src3_row[sj + 14]);
+
+                    dst_row[dj + 3] = areaDownsamplingDivision(
+                                          (u16)src0_row[sj +  3] + src0_row[sj +  7] +
+                                               src0_row[sj + 11] + src0_row[sj + 15] +
+                                               src1_row[sj +  3] + src1_row[sj +  7] +
+                                               src1_row[sj + 11] + src1_row[sj + 15] +
+                                               src2_row[sj +  3] + src2_row[sj +  7] +
+                                               src2_row[sj + 11] + src2_row[sj + 15] +
+                                               src3_row[sj +  3] + src3_row[sj +  7] +
+                                               src3_row[sj + 11] + src3_row[sj + 15]);
+                }
+            }
+        }
+    }
+    else if (channels == 3)
+    {
+        if ((wr == 2.0f) && (hr == 2.0f))
+        {
+#ifndef ANDROID
+            size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
+#endif
+            size_t roiw8 = dsize.width >= 7 ?
(dsize.width - 7) * 3 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw16; dj += 48, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint8x8x3_t v_dst0, v_dst1; + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst0.val[0] = areaDownsamplingDivision(v_el0); + v_dst0.val[1] = areaDownsamplingDivision(v_el1); + v_dst0.val[2] = areaDownsamplingDivision(v_el2); + } + + vLane1 = vld3q_u8(src0_row + sj + 48); + vLane2 = vld3q_u8(src1_row + sj + 48); + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst1.val[0] = areaDownsamplingDivision(v_el0); + v_dst1.val[1] = areaDownsamplingDivision(v_el1); + v_dst1.val[2] = areaDownsamplingDivision(v_el2); + } + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_dst0.val[0], v_dst1.val[0]); + v_dst.val[1] = vcombine_u8(v_dst0.val[1], v_dst1.val[1]); + v_dst.val[2] = vcombine_u8(v_dst0.val[2], v_dst1.val[2]); + + vst3q_u8(dst_row + dj, v_dst); + } +#endif + + for ( ; dj < roiw8; dj += 24, sj += 48) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + uint8x8x3_t v_dst; + v_dst.val[0] = areaDownsamplingDivision(v_el0); + v_dst.val[1] = areaDownsamplingDivision(v_el1); + v_dst.val[2] = areaDownsamplingDivision(v_el2); + + vst3_u8(dst_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 6) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src1_row[sj ] + src1_row[sj + 3]); + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src1_row[sj + 1] + src1_row[sj + 4]); + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src1_row[sj + 2] + src1_row[sj + 5]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { +#ifndef ANDROID + size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) * 3 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? 
(dsize.width - 15) * 3 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 96, sj += 48) + { + internal::prefetch(src_row + sj); + + uint8x16x3_t v_src = vld3q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + vst3q_u8(dst0_row + dj + 48, v_dst); + vst3q_u8(dst1_row + dj + 48, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 48, sj += 24) + { + internal::prefetch(src_row + sj); + + uint8x8x3_t v_src = vld3_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 6, sj += 3) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 3] = src_val; + dst1_row[dj] = dst1_row[dj + 3] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 4] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 5] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { +#ifndef ANDROID + size_t roiw8 = dsize.width >= 7 ? 
(dsize.width - 7) * 3 : 0; +#endif + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw8; dj += 24, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16x3_t vLane10 = vld3q_u8(src0_row + sj), vLane11 = vld3q_u8(src0_row + sj + 48); + uint8x16x3_t vLane20 = vld3q_u8(src1_row + sj), vLane21 = vld3q_u8(src1_row + sj + 48); + uint8x16x3_t vLane30 = vld3q_u8(src2_row + sj), vLane31 = vld3q_u8(src2_row + sj + 48); + uint8x16x3_t vLane40 = vld3q_u8(src3_row + sj), vLane41 = vld3q_u8(src3_row + sj + 48); + + uint8x8x3_t v_dst; + + // channel 0 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[0]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[0]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[0]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[0]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[0]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[0]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[0]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[0]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[0] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 1 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[1]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[1]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[1]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[1]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[1]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[1]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[1]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[1]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[1] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 2 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[2]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[2]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[2]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[2]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[2]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[2]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[2]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[2]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[2] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } 
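+
+                    // Same pairwise trick in all three channel blocks above (annotation):
+                    // vpaddlq_u8 sums adjacent pixel pairs per de-interleaved channel
+                    // plane (u8 -> u16), the four row sums are accumulated, and
+                    // vpaddlq_u16 folds the pair-sums into 4x4-block totals, which
+                    // areaDownsamplingDivision() rounds to the block average.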
+ + vst3_u8(dst_row + dj, v_dst); + } +#endif + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 12) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src0_row[sj + 6] + src0_row[sj + 9] + + src1_row[sj ] + src1_row[sj + 3] + + src1_row[sj + 6] + src1_row[sj + 9] + + src2_row[sj ] + src2_row[sj + 3] + + src2_row[sj + 6] + src2_row[sj + 9] + + src3_row[sj ] + src3_row[sj + 3] + + src3_row[sj + 6] + src3_row[sj + 9]); + + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src0_row[sj + 7] + src0_row[sj + 10] + + src1_row[sj + 1] + src1_row[sj + 4] + + src1_row[sj + 7] + src1_row[sj + 10] + + src2_row[sj + 1] + src2_row[sj + 4] + + src2_row[sj + 7] + src2_row[sj + 10] + + src3_row[sj + 1] + src3_row[sj + 4] + + src3_row[sj + 7] + src3_row[sj + 10]); + + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src0_row[sj + 8] + src0_row[sj + 11] + + src1_row[sj + 2] + src1_row[sj + 5] + + src1_row[sj + 8] + src1_row[sj + 11] + + src2_row[sj + 2] + src2_row[sj + 5] + + src2_row[sj + 8] + src2_row[sj + 11] + + src3_row[sj + 2] + src3_row[sj + 5] + + src3_row[sj + 8] + src3_row[sj + 11]); + } + } + } + } +#else + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; +#endif + (void)ssize; +} + +void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +#ifdef CAROTENE_NEON + +namespace { + +uint8x8_t resizeLinearStep(uint8x16_t vr1, uint8x16_t vr2, + uint8x8_t vlutl, uint8x8_t vluth, + float32x4_t vrw, float32x4_t vcw0, float32x4_t vcw1) +{ + uint8x8_t vr1l = internal::vqtbl1_u8(vr1, vlutl); + uint8x8_t vr1h = internal::vqtbl1_u8(vr1, vluth); + uint8x8_t vr2l = internal::vqtbl1_u8(vr2, vlutl); + uint8x8_t vr2h = internal::vqtbl1_u8(vr2, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + float32x4_t v1L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1hw))); + float32x4_t v1H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1hw))); + float32x4_t v2L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v2hw))); + float32x4_t v2H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v2hw))); + + v1L = vmlaq_f32(v1L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v1df))), vcw0); + v1H = vmlaq_f32(v1H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v1df))), vcw1); + v2L = vmlaq_f32(v2L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v2df))), vcw0); + v2H = vmlaq_f32(v2H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v2df))), vcw1); + + float32x4_t vdiffL = vsubq_f32(v1L, v2L); + float32x4_t vdiffH = vsubq_f32(v1H, v2H); + + float32x4_t vL = vmlaq_f32(v2L, vdiffL, vrw); + float32x4_t vH = vmlaq_f32(v2H, vdiffH, vrw); + uint16x4_t vL_ = vmovn_u32(vcvtq_u32_f32(vL)); + uint16x4_t vH_ = vmovn_u32(vcvtq_u32_f32(vH)); + return vmovn_u16(vcombine_u16(vL_, vH_)); +} + +} // namespace + +namespace { + +void resize_bilinear_rows(const Size2D &ssize, const 
Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 hr, const u8** gcols, u8* gcweight, u8* buf) +{ + f32 scale_y_offset = 0.5f * hr - 0.5f; + + size_t dst_h8 = dsize.height & ~7; + size_t dst_w8 = dsize.width & ~7; + size_t src_w8 = ssize.width & ~7; + + size_t r = 0; + for (; r < dst_h8; r += 8) + { +resize8u_xystretch: + const u8* rows[16]; + u8 rweight[8]; + + for (u32 i = 0; i < 8; ++i) + { + f32 w = (i + r) * hr + scale_y_offset; + ptrdiff_t src_row = floorf(w); + ptrdiff_t src_row2 = src_row + 1; + + rweight[i] = (u8)((src_row2-w) * 128); + + if (src_row < 0) + src_row = 0; + if (src_row2 >= (ptrdiff_t)ssize.height) + src_row2 = ssize.height-1; + + rows[2 * i] = srcBase + src_row * srcStride; + rows[2 * i + 1] = srcBase + src_row2 * srcStride; + } + + uint8x8_t vr0w = vdup_n_u8(rweight[0]); + uint8x8_t vr1w = vdup_n_u8(rweight[1]); + uint8x8_t vr2w = vdup_n_u8(rweight[2]); + uint8x8_t vr3w = vdup_n_u8(rweight[3]); + uint8x8_t vr4w = vdup_n_u8(rweight[4]); + uint8x8_t vr5w = vdup_n_u8(rweight[5]); + uint8x8_t vr6w = vdup_n_u8(rweight[6]); + uint8x8_t vr7w = vdup_n_u8(rweight[7]); + + uint8x8_t vr0w2 = vdup_n_u8(128 - rweight[0]); + uint8x8_t vr1w2 = vdup_n_u8(128 - rweight[1]); + uint8x8_t vr2w2 = vdup_n_u8(128 - rweight[2]); + uint8x8_t vr3w2 = vdup_n_u8(128 - rweight[3]); + uint8x8_t vr4w2 = vdup_n_u8(128 - rweight[4]); + uint8x8_t vr5w2 = vdup_n_u8(128 - rweight[5]); + uint8x8_t vr6w2 = vdup_n_u8(128 - rweight[6]); + uint8x8_t vr7w2 = vdup_n_u8(128 - rweight[7]); + + size_t col = 0; + for(; col < src_w8; col += 8) + { + internal::prefetch(rows[3] + col); + internal::prefetch(rows[7] + col); + internal::prefetch(rows[11] + col); + internal::prefetch(rows[15] + col); +resize8u_ystretch: + uint8x8_t vsrc0l1 = vld1_u8(rows[0] + col); + uint8x8_t vsrc0l2 = vld1_u8(rows[1] + col); + uint8x8_t vsrc1l1 = vld1_u8(rows[2] + col); + uint8x8_t vsrc1l2 = vld1_u8(rows[3] + col); + + // (l1 * w + l2 * (128 - w) + 64) / 128 + uint16x8_t vdst0l = vmull_u8(vsrc0l1, vr0w); + uint16x8_t vdst1l = vmull_u8(vsrc1l1, vr1w); + + uint8x8_t vsrc2l1 = vld1_u8(rows[4] + col); + uint8x8_t vsrc2l2 = vld1_u8(rows[5] + col); + uint8x8_t vsrc3l1 = vld1_u8(rows[6] + col); + uint8x8_t vsrc3l2 = vld1_u8(rows[7] + col); + + vdst0l = vmlal_u8(vdst0l, vsrc0l2, vr0w2); + vdst1l = vmlal_u8(vdst1l, vsrc1l2, vr1w2); + uint16x8_t vdst2l = vmull_u8(vsrc2l1, vr2w); + uint16x8_t vdst3l = vmull_u8(vsrc3l1, vr3w); + + uint8x8_t vsrc4l1 = vld1_u8(rows[8] + col); + uint8x8_t vsrc4l2 = vld1_u8(rows[9] + col); + uint8x8_t vsrc5l1 = vld1_u8(rows[10] + col); + uint8x8_t vsrc5l2 = vld1_u8(rows[11] + col); + + vdst2l = vmlal_u8(vdst2l, vsrc2l2, vr2w2); + vdst3l = vmlal_u8(vdst3l, vsrc3l2, vr3w2); + uint16x8_t vdst4l = vmull_u8(vsrc4l1, vr4w); + uint16x8_t vdst5l = vmull_u8(vsrc5l1, vr5w); + + uint8x8_t vsrc6l1 = vld1_u8(rows[12] + col); + uint8x8_t vsrc6l2 = vld1_u8(rows[13] + col); + uint8x8_t vsrc7l1 = vld1_u8(rows[14] + col); + uint8x8_t vsrc7l2 = vld1_u8(rows[15] + col); + + uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7); + uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7); + vdst4l = vmlal_u8(vdst4l, vsrc4l2, vr4w2); + vdst5l = vmlal_u8(vdst5l, vsrc5l2, vr5w2); + uint16x8_t vdst6l = vmull_u8(vsrc6l1, vr6w); + uint16x8_t vdst7l = vmull_u8(vsrc7l1, vr7w); + + uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7); + uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7); + vdst6l = vmlal_u8(vdst6l, vsrc6l2, vr6w2); + vdst7l = vmlal_u8(vdst7l, vsrc7l2, vr7w2); + + uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7); + uint8x8_t vdst5 = 
vrshrn_n_u16(vdst5l, 7); + uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7); + uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7); + + // == 8x8 matrix transpose == + + //00 01 02 03 04 05 06 07 d0 + //10 11 12 13 14 15 16 17 d1 + //20 21 22 23 24 25 26 27 d2 + //30 31 32 33 34 35 36 37 d3 + //40 41 42 43 44 45 46 47 d4 + //50 51 52 53 54 55 56 57 d5 + //60 61 62 63 64 65 66 67 d6 + //70 71 72 73 74 75 76 77 d7 + + uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1); + uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3); + uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5); + uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7); + + uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]); + uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]); + uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]); + uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]); + + //00 10 02 12 04 14 06 16 d0 + //01 11 03 13 05 15 07 17 d1 + //20 30 22 32 24 34 26 36 d2 + //21 31 23 33 25 35 27 37 d3 + //40 50 42 52 44 54 46 56 d4 + //41 51 43 53 45 55 47 57 d5 + //60 70 62 72 64 74 66 76 d6 + //61 71 63 73 65 75 67 77 d7 + + uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2); + uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6); + + //00 10 20 30 04 14 24 34 d0 + //01 11 21 31 05 15 25 35 d1 + //02 12 22 32 06 16 26 36 d2 + //03 13 23 33 07 17 27 37 d3 + //40 50 60 70 44 54 64 74 d4 + //41 51 61 71 45 55 65 75 d5 + //42 52 62 72 46 56 66 76 d6 + //43 53 63 73 47 57 67 77 d7 + + uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]); + uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]); + + //00 10 20 30 40 50 60 70 d0 + //01 11 21 31 41 51 61 71 d1 + //02 12 22 32 42 52 62 72 d2 + //03 13 23 33 43 53 63 73 d3 + //04 14 24 34 44 54 64 74 d4 + //05 15 25 35 45 55 65 75 d5 + //06 16 26 36 46 56 66 76 d6 + //07 17 27 37 47 57 67 77 d7 + + vst1q_u8(buf + col * 8 + 0, (uint8x16_t)vq2q0t.val[0]); + vst1q_u8(buf + col * 8 + 16, (uint8x16_t)vq3q1t.val[0]); + vst1q_u8(buf + col * 8 + 32, (uint8x16_t)vq2q0t.val[1]); + vst1q_u8(buf + col * 8 + 48, (uint8x16_t)vq3q1t.val[1]); + } + + if (col < ssize.width) + { + col = ssize.width - 8; + goto resize8u_ystretch; + } + + u8* dst_data = dstBase + r * dstStride; + const u8** cols = gcols; + u8* cweight = gcweight; + + size_t dcol = 0; + for (; dcol < dst_w8; dcol += 8, cols += 16, cweight += 8) + { + internal::prefetch(cols[0], 64*4); +resize8u_xstretch: + uint8x8_t vc0w = vdup_n_u8(cweight[0]); + uint8x8_t vc1w = vdup_n_u8(cweight[1]); + uint8x8_t vc2w = vdup_n_u8(cweight[2]); + uint8x8_t vc3w = vdup_n_u8(cweight[3]); + uint8x8_t vc4w = vdup_n_u8(cweight[4]); + uint8x8_t vc5w = vdup_n_u8(cweight[5]); + uint8x8_t vc6w = vdup_n_u8(cweight[6]); + uint8x8_t vc7w = vdup_n_u8(cweight[7]); + + uint8x8_t vc0w2 = vdup_n_u8(128 - cweight[0]); + uint8x8_t vc1w2 = vdup_n_u8(128 - cweight[1]); + uint8x8_t vc2w2 = vdup_n_u8(128 - cweight[2]); + uint8x8_t vc3w2 = vdup_n_u8(128 - cweight[3]); + uint8x8_t vc4w2 = vdup_n_u8(128 - cweight[4]); + uint8x8_t vc5w2 = vdup_n_u8(128 - cweight[5]); + uint8x8_t vc6w2 = vdup_n_u8(128 - cweight[6]); + uint8x8_t vc7w2 = vdup_n_u8(128 - cweight[7]); + + uint8x8_t vsrc0l1 = vld1_u8(cols[0]); + uint8x8_t vsrc0l2 = vld1_u8(cols[1]); + uint8x8_t vsrc1l1 = vld1_u8(cols[2]); + uint8x8_t vsrc1l2 = vld1_u8(cols[3]); + uint8x8_t vsrc2l1 = vld1_u8(cols[4]); + uint8x8_t vsrc2l2 = vld1_u8(cols[5]); + uint8x8_t vsrc3l1 = vld1_u8(cols[6]); + uint8x8_t vsrc3l2 = vld1_u8(cols[7]); + 
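// cols[] holds precomputed (left, right) pointer pairs into the transposed
+                // row buffer (one source column = 8 consecutive bytes there), and
+                // cweight[] the matching left-hand weights on a 0..128 scale, so the
+                // horizontal lerp below stays in u8/u16 integer arithmetic.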
+                uint8x8_t vsrc4l1 = vld1_u8(cols[8]);
+                uint8x8_t vsrc4l2 = vld1_u8(cols[9]);
+                uint8x8_t vsrc5l1 = vld1_u8(cols[10]);
+                uint8x8_t vsrc5l2 = vld1_u8(cols[11]);
+                uint8x8_t vsrc6l1 = vld1_u8(cols[12]);
+                uint8x8_t vsrc6l2 = vld1_u8(cols[13]);
+                uint8x8_t vsrc7l1 = vld1_u8(cols[14]);
+                uint8x8_t vsrc7l2 = vld1_u8(cols[15]);
+
+                // (l1 * w + l2 * (128 - w) + 64) / 128
+                uint16x8_t vdst0l = vmull_u8(vsrc0l1, vc0w);
+                uint16x8_t vdst1l = vmull_u8(vsrc1l1, vc1w);
+                uint16x8_t vdst2l = vmull_u8(vsrc2l1, vc2w);
+                uint16x8_t vdst3l = vmull_u8(vsrc3l1, vc3w);
+                uint16x8_t vdst4l = vmull_u8(vsrc4l1, vc4w);
+                uint16x8_t vdst5l = vmull_u8(vsrc5l1, vc5w);
+                uint16x8_t vdst6l = vmull_u8(vsrc6l1, vc6w);
+                uint16x8_t vdst7l = vmull_u8(vsrc7l1, vc7w);
+
+                vdst0l = vmlal_u8(vdst0l, vsrc0l2, vc0w2);
+                vdst1l = vmlal_u8(vdst1l, vsrc1l2, vc1w2);
+                vdst2l = vmlal_u8(vdst2l, vsrc2l2, vc2w2);
+                vdst3l = vmlal_u8(vdst3l, vsrc3l2, vc3w2);
+                vdst4l = vmlal_u8(vdst4l, vsrc4l2, vc4w2);
+                vdst5l = vmlal_u8(vdst5l, vsrc5l2, vc5w2);
+                vdst6l = vmlal_u8(vdst6l, vsrc6l2, vc6w2);
+                vdst7l = vmlal_u8(vdst7l, vsrc7l2, vc7w2);
+
+                uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7);
+                uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7);
+                uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7);
+                uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7);
+                uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7);
+                uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7);
+                uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7);
+                uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7);
+
+                // == 8x8 matrix transpose ==
+                uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1);
+                uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3);
+                uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5);
+                uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7);
+                uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]);
+                uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]);
+                uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]);
+                uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]);
+                uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2);
+                uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6);
+                uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]);
+                uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]);
+
+                //save results
+                vst1_u8(dst_data + 0 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[0]));
+                vst1_u8(dst_data + 1 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[0]));
+                vst1_u8(dst_data + 2 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[0]));
+                vst1_u8(dst_data + 3 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[0]));
+                vst1_u8(dst_data + 4 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[1]));
+                vst1_u8(dst_data + 5 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[1]));
+                vst1_u8(dst_data + 6 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[1]));
+                vst1_u8(dst_data + 7 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[1]));
+            }
+
+            if (dcol < dsize.width)
+            {
+                dcol = dsize.width - 8;
+                cols = gcols + dcol * 2;
+                cweight = gcweight + dcol;
+                goto resize8u_xstretch;
+            }
+        }
+
+        if (r < dsize.height)
+        {
+            r = dsize.height - 8;
+            goto resize8u_xystretch;
+        }
+}
+
+template <int channels> struct resizeLinearInternals;
+template <> struct resizeLinearInternals<1>
+{
+    int32x4_t vc_upd;
+    int32x4_t vc0;
+    int32x4_t vcmax;
+
+    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
+    {
+        vc_upd = vdupq_n_s32(4);
+        vc0 = vdupq_n_s32(0);
+        vcmax = vdupq_n_s32(srccols-1);
+
+        s32 tmp0123[] = {0, 1, 2, 3 };
+        vi = vld1q_s32(tmp0123);
+    }
+    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
+    {
+        vsrch = vminq_s32(vsrch, vcmax);
+        vsrcl = vmaxq_s32(vsrcl, vc0);
+        vsrcl = vminq_s32(vsrcl, vcmax);//for safe tail
+        vsrch = vshlq_n_s32(vsrch, 3);
+        vsrcl = vshlq_n_s32(vsrcl, 3);
+        vi = vaddq_s32(vi, vc_upd);
+    }
+};
+template <> struct resizeLinearInternals<4>
+{
+    int32x4_t vc_upd;
+    int32x4_t vc0;
+    int32x4_t vcmax;
+    int32x4_t v0123x8;
+
+    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
+    {
+        vc_upd = vdupq_n_s32(1);
+        vc0 = vdupq_n_s32(0);
+        vcmax = vdupq_n_s32(srccols-1);
+        s32 tmp0123x8[] = {0, 8, 16, 24};
+        v0123x8 = vld1q_s32(tmp0123x8);
+
+        vi = vc0;
+    }
+    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
+    {
+        vsrch = vminq_s32(vsrch, vcmax);
+        vsrcl = vmaxq_s32(vsrcl, vc0);
+        vsrch = vshlq_n_s32(vsrch, 5);
+        vsrcl = vshlq_n_s32(vsrcl, 5);
+        vsrch = vaddq_s32(vsrch, v0123x8);
+        vsrcl = vaddq_s32(vsrcl, v0123x8);
+        vi = vaddq_s32(vi, vc_upd);
+    }
+};
+
+template <int channels>
+void resizeLinearOpenCVchan(const Size2D &_ssize, const Size2D &_dsize,
+                            const u8 * srcBase, ptrdiff_t srcStride,
+                            u8 * dstBase, ptrdiff_t dstStride,
+                            f32 wr, f32 hr)
+{
+    float scale_x_offset = 0.5f * wr - 0.5f;
+
+    Size2D ssize(_ssize.width*channels, _ssize.height);
+    Size2D dsize(_dsize.width*channels, _dsize.height);
+
+    std::vector<u8> gcweight((dsize.width + 7) & ~7);
+    std::vector<const u8 *> gcols(((dsize.width + 7) & ~7) * 2);
+    std::vector<u8> buf(((ssize.width + 7) & ~7) * 8); // (8 rows) x (width of src)
+
+    float32x4_t vscale_x = vdupq_n_f32(wr);
+    float32x4_t vscale_x_offset = vdupq_n_f32(scale_x_offset);
+    int32x4_t vc1 = vdupq_n_s32(1);
+    float32x4_t vc128f = vdupq_n_f32(128.0f);
+
+    int32x4_t vi;
+    resizeLinearInternals<channels> indexes(vi, _ssize.width);//u32 is used to store indexes
+    //so we could get issues on src image dimensions greater than (2^32-1)
+
+    for (size_t dcol = 0; dcol < dsize.width; dcol += 8)
+    {
+        s32 idx[16];
+
+        float32x4_t vif = vcvtq_f32_s32(vi);
+        float32x4_t vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
+        int32x4_t vwi = vcvtq_s32_f32(vw);
+        float32x4_t vwif = vcvtq_f32_s32(vwi);
+        int32x4_t vmask = (int32x4_t)vcltq_f32(vwif, vw);
+        int32x4_t vsrch = vsubq_s32(vwi, vmask);
+        int32x4_t vsrcl = vsubq_s32(vsrch, vc1);
+        float32x4_t vsrchf = vcvtq_f32_s32(vsrch);
+        float32x4_t vw2 = vsubq_f32(vsrchf, vw);
+
+        vw2 = vmulq_f32(vw2, vc128f);
+        uint32x4_t vw32u = vcvtq_u32_f32(vw2);
+        uint16x4_t vw16ul = vmovn_u32(vw32u);
+        indexes.updateIndexes(vi, vsrch, vsrcl);
+
+        vst1q_s32(idx + 0, vsrcl);
+        vst1q_s32(idx + 8, vsrch);
+
+        vif = vcvtq_f32_s32(vi);
+        vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
+        vwi = vcvtq_s32_f32(vw);
+        vwif = vcvtq_f32_s32(vwi);
+        vmask = (int32x4_t)vcltq_f32(vwif, vw);
+        vsrch = vsubq_s32(vwi, vmask);
+        vsrcl = vsubq_s32(vsrch, vc1);
+        vsrchf = vcvtq_f32_s32(vsrch);
+        vw2 = vsubq_f32(vsrchf, vw);
+
+        vw2 = vmulq_f32(vw2, vc128f);
+        vw32u = vcvtq_u32_f32(vw2);
+        indexes.updateIndexes(vi, vsrch, vsrcl);
+
+        uint16x4_t vw16uh = vmovn_u32(vw32u);
+
+        vst1q_s32(idx + 4, vsrcl);
+        vst1q_s32(idx + 12, vsrch);
+
+        uint8x8_t vw8u = vmovn_u16(vcombine_u16(vw16ul, vw16uh));
+
+        for (u32 i = 0; i < 8; ++i)
+        {
+            gcols[dcol * 2 + i*2] = &buf[idx[i]];
+            gcols[dcol * 2 + i*2 + 1] = &buf[idx[i + 8]];
+        }
+
+        vst1_u8(&gcweight[dcol], vw8u);
+    }
+
+    resize_bilinear_rows(ssize, dsize, srcBase, srcStride, dstBase, dstStride, hr, &gcols[0], &gcweight[0], &buf[0]);
+}
+
+void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize,
+                              const u8 * srcBase, ptrdiff_t srcStride,
+                              u8 * dstBase, ptrdiff_t dstStride,
+                              f32 wr, f32 hr)
+{
+    internal::assertSupportedConfiguration(wr <= 2.f && hr <= 2.f);
+
+    enum { SHIFT_BITS = 11 };
+
+    f32 scale_x_offset = 0.5f * wr - 0.5f;
+    f32 scale_y_offset = 0.5f * hr - 0.5f;
+
+    std::vector<s32> _buf(dsize.height*(2*(sizeof(ptrdiff_t)/sizeof(s32))+1)+1);
+    ptrdiff_t* buf = (ptrdiff_t*)&_buf[0];
+    s32* buf2 = (s32*)buf+2*(sizeof(ptrdiff_t)/sizeof(s32))*dsize.height;
+    for(size_t row = 0; row < (size_t)dsize.height; ++row)
+    {
+        f32 r = row * hr + scale_y_offset;
+        ptrdiff_t src_row = floorf(r);
+        ptrdiff_t src_row2 = src_row + 1;
+
+        f32 rweight = src_row2 - r;
+        buf2[row] = floorf(rweight * (1 << SHIFT_BITS) + 0.5f);
+        buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
+        buf[1 * dsize.height + row] = std::min((ptrdiff_t)ssize.height-1, src_row2);
+    }
+
+#define USE_CORRECT_VERSION 0
+
+    ptrdiff_t col = 0;
+/***********************************************/
+    for(; col <= (ptrdiff_t)dsize.width-16; col+=16)
+    {
+        ptrdiff_t col1[16];
+        ptrdiff_t col2[16];
+        s16 cwi[16];
+
+        for(s32 k = 0; k < 16; ++k)
+        {
+            f32 c = (col + k) * wr + scale_x_offset;
+            col1[k] = (ptrdiff_t)c;
+            col2[k] = col1[k] + 1;
+
+            cwi[k] = (short)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f);
+
+            if(col1[k] < 0) col1[k] = 0;
+            if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = ssize.width-1;
+        }
+
+        ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16);
+        ptrdiff_t y = std::min(col1[8], (ptrdiff_t)ssize.width-16);
+        u8 lutl[16];
+        u8 luth[16];
+        for(s32 k = 0; k < 8; ++k)
+        {
+            lutl[k] = (u8)(col1[k] - x);
+            luth[k] = (u8)(col2[k] - x);
+            lutl[k+8] = (u8)(col1[k+8] - y);
+            luth[k+8] = (u8)(col2[k+8] - y);
+        }
+
+        uint8x8_t vlutl = vld1_u8(lutl);
+        uint8x8_t vluth = vld1_u8(luth);
+        int16x8_t vcw = vld1q_s16(cwi);
+
+        uint8x8_t vlutl_ = vld1_u8(lutl+8);
+        uint8x8_t vluth_ = vld1_u8(luth+8);
+        int16x8_t vcw_ = vld1q_s16(cwi+8);
+
+        for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row)
+        {
+#if USE_CORRECT_VERSION
+            int32x4_t vrw = vdupq_n_s32(buf2[row]);
+#else
+            int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]);
+            int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row]));
+#endif
+
+            internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride);
+            internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride);
+
+            {
+                union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) };
+                union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) };
+
+                uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl);
+                uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth);
+                uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl);
+                uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth);
+
+                uint16x8_t v1hw = vmovl_u8(vr1h);
+                uint16x8_t v2hw = vmovl_u8(vr2h);
+
+                int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
+                int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
+
+                int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
+                int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
+                int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
+                int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
+
+                v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw));
+                v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw));
+                v2L =
vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 5); + int16x4_t v1Hs = vshrn_n_s32(v1H, 5); + int16x4_t v2Ls = vshrn_n_s32(v2L, 5); + int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(v1s, v2s); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#endif + } + + { + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + y) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + y) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl_); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth_); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl_); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth_); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw_)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw_)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw_)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw_)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 5); + int16x4_t v1Hs = vshrn_n_s32(v1H, 5); + int16x4_t v2Ls = vshrn_n_s32(v2L, 5); + int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(v1s, v2s); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); +#endif + } + } + } 
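+
+/* Tail handling (annotation): the remaining columns go through the 8-wide loop
+ * below; a final partial block simply re-runs the last full 8 columns via the
+ * goto label, which is safe because recomputing an already-written column
+ * stores the identical value. */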
+/***********************************************/ + for(; col <= (ptrdiff_t)dsize.width-8; col+=8) + { +downsample_bilinear_8uc1_col_loop8: + ptrdiff_t col1[8]; + ptrdiff_t col2[8]; + s16 cwi[8]; + + for(s32 k = 0; k < 8; ++k) + { + f32 c = (col + k) * wr + scale_x_offset; + col1[k] = (ptrdiff_t)c; + col2[k] = col1[k] + 1; + + cwi[k] = (s16)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f); + + if(col1[k] < 0) col1[k] = 0; + if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = (ptrdiff_t)ssize.width-1; + } + + ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16); + u8 lutl[8]; + u8 luth[8]; + for(s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + int16x8_t vcw = vld1q_s16(cwi); + + for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row) + { +#if USE_CORRECT_VERSION + int32x4_t vrw = vdupq_n_s32(buf2[row]); +#else + int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]); + int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row])); +#endif + + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride); + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride); + + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 5); + int16x4_t v1Hs = vshrn_n_s32(v1H, 5); + int16x4_t v2Ls = vshrn_n_s32(v2L, 5); + int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(v1s, v2s); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + 
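// Net-scale sketch for the branch above (annotation, assuming SHIFT_BITS == 11):
+            // v1L/v2L carry Q11 row values, vshrn_n_s32(.., 5) makes them Q6, and
+            // vqdmulhq_s16(a, b) yields (2*a*b) >> 16, so each product is
+            //   (v << 6) * (w << 11) * 2 >> 16  ==  4 * v * w,
+            // and vqrshrun_n_s16(vsum, 2) removes the final factor of 4 with rounding.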
+            vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
+#endif
+        }
+    }
+    if (col < (ptrdiff_t)dsize.width)
+    {
+        col = dsize.width - 8;
+        goto downsample_bilinear_8uc1_col_loop8;
+    }
+}
+
+} // namespace
+
+#endif
+
+void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize,
+                        const u8 * srcBase, ptrdiff_t srcStride,
+                        u8 * dstBase, ptrdiff_t dstStride,
+                        f32 wr, f32 hr, u32 channels)
+{
+    internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
+                                           (dsize.width - 0.5) * wr - 0.5 < ssize.width &&
+                                           (dsize.height - 0.5) * hr - 0.5 < ssize.height &&  // Ensure we have enough source data
+                                           (dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
+                                           (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
+                                           isResizeLinearOpenCVSupported(ssize, dsize, channels));
+#ifdef CAROTENE_NEON
+    if(1 == channels)
+    {
+        if (wr <= 1.f && hr <= 1.f)
+            resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
+        else if (wr <= 2.0f && hr <= 2.0f && ssize.width >= 16)
+            downsample_bilinear_8uc1(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
+        else
+            resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
+    }
+    else if(4 == channels)
+        resizeLinearOpenCVchan<4>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
+#else
+    (void)ssize;
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)wr;
+    (void)hr;
+    (void)channels;
+#endif
+}
+
+void resizeLinear(const Size2D &ssize, const Size2D &dsize,
+                  const u8 * srcBase, ptrdiff_t srcStride,
+                  u8 * dstBase, ptrdiff_t dstStride,
+                  f32 wr, f32 hr, u32 channels)
+{
+    internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
+                                           (dsize.width - 0.5) * wr - 0.5 < ssize.width &&
+                                           (dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data
+                                           (dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
+                                           (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
+                                           isResizeLinearSupported(ssize, dsize,
+                                                                   wr, hr, channels));
+#ifdef CAROTENE_NEON
+    f32 scale_x = wr;
+    f32 scale_x_offset = 0.5f * scale_x - 0.5f;
+    f32 scale_y = hr;
+    f32 scale_y_offset = 0.5f * scale_y - 0.5f;
+
+    std::vector<ptrdiff_t> _buf(dsize.height * 3 + 1);
+    std::vector<f32> coeff(dsize.height);
+    ptrdiff_t * buf = &_buf[0];
+
+    for (size_t row = 0; row < dsize.height; ++row)
+    {
+        f32 r = row * scale_y + scale_y_offset;
+        ptrdiff_t src_row = floorf(r);
+        ptrdiff_t src_row2 = src_row + 1;
+
+        f32 rweight = src_row2 - r;
+        buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
+        buf[1 * dsize.height + row] = std::min<ptrdiff_t>(ssize.height - 1, src_row2);
+        coeff[row] = rweight;
+    }
+
+    size_t col = 0;
+    for ( ; col + 16 <= dsize.width; col += 16)
+    {
+        ptrdiff_t col1[16], col2[16];
+        f32 cwi[16];
+
+        for(s32 k = 0; k < 16; ++k)
+        {
+            f32 c = (col + k) * scale_x + scale_x_offset;
+            col1[k] = floorf(c);
+            col2[k] = col1[k] + 1;
+
+            cwi[k] = col2[k] - c;
+
+            if (col1[k] < 0)
+                col1[k] = 0;
+            if (col2[k] >= (ptrdiff_t)ssize.width)
+                col2[k] = ssize.width - 1;
+        }
+
+        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
+        ptrdiff_t y = std::min<ptrdiff_t>(col1[8], ssize.width - 16);
+        u8 lutl[16], luth[16];
+
+        for (s32 k = 0; k < 8; ++k)
+        {
+            lutl[k] = (u8)(col1[k] - x);
+            luth[k] = (u8)(col2[k] - x);
+            lutl[k + 8] = (u8)(col1[k + 8] - y);
+            luth[k + 8] = (u8)(col2[k + 8] - y);
+        }
+
+        uint8x8_t vlutl = vld1_u8(lutl);
+        uint8x8_t vluth = vld1_u8(luth);
+        float32x4_t vcw0 = vld1q_f32(cwi);
+        float32x4_t vcw1 = vld1q_f32(cwi + 4);
+
+        uint8x8_t
vlutl_ = vld1_u8(lutl + 8); + uint8x8_t vluth_ = vld1_u8(luth + 8); + float32x4_t vcw0_ = vld1q_f32(cwi + 8); + float32x4_t vcw1_ = vld1q_f32(cwi + 12); + + if (channels == 1) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x8_t vres0 = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x), + vlutl, vluth, + vrw, vcw0, vcw1); + + uint8x8_t vres1 = resizeLinearStep(vld1q_u8(srow0 + y), vld1q_u8(srow1 + y), + vlutl_, vluth_, + vrw, vcw0_, vcw1_); + + vst1q_u8(drow + col, vcombine_u8(vres0, vres1)); + } + } + else if (channels == 3) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x3_t v_src10 = vld3q_u8(srow0 + (x * 3)); + uint8x16x3_t v_src20 = vld3q_u8(srow1 + (x * 3)); + + uint8x16x3_t v_src11 = vld3q_u8(srow0 + (y * 3)); + uint8x16x3_t v_src21 = vld3q_u8(srow1 + (y * 3)); + + uint8x16x3_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + + vst3q_u8(drow + (col * 3), v_dst); + } + } + else if (channels == 4) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x4_t v_src10 = vld4q_u8(srow0 + (x << 2)); + uint8x16x4_t v_src20 = vld4q_u8(srow1 + (x << 2)); + + uint8x16x4_t v_src11 = vld4q_u8(srow0 + (y << 2)); + uint8x16x4_t v_src21 = vld4q_u8(srow1 + (y << 2)); + + uint8x16x4_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, 
vluth_, vrw, vcw0_, vcw1_));
+                v_dst.val[3] = vcombine_u8(resizeLinearStep(v_src10.val[3], v_src20.val[3], vlutl, vluth, vrw, vcw0, vcw1),
+                                           resizeLinearStep(v_src11.val[3], v_src21.val[3], vlutl_, vluth_, vrw, vcw0_, vcw1_));
+
+                vst4q_u8(drow + (col << 2), v_dst);
+            }
+        }
+    }
+
+    for ( ; col + 8 <= dsize.width; col += 8)
+    {
+downsample_bilinear_8uc1_col_loop8:
+        ptrdiff_t col1[8], col2[8];
+        f32 cwi[8];
+
+        for (s32 k = 0; k < 8; ++k)
+        {
+            f32 c = (col + k) * scale_x + scale_x_offset;
+            col1[k] = floorf(c);
+            col2[k] = col1[k] + 1;
+
+            cwi[k] = col2[k] - c;
+
+            if (col1[k] < 0)
+                col1[k] = 0;
+            if (col2[k] >= (ptrdiff_t)ssize.width)
+                col2[k] = ssize.width - 1;
+        }
+
+        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
+        u8 lutl[8], luth[8];
+        for (s32 k = 0; k < 8; ++k)
+        {
+            lutl[k] = (u8)(col1[k] - x);
+            luth[k] = (u8)(col2[k] - x);
+        }
+
+        uint8x8_t vlutl = vld1_u8(lutl);
+        uint8x8_t vluth = vld1_u8(luth);
+        float32x4_t vcw0 = vld1q_f32(cwi);
+        float32x4_t vcw1 = vld1q_f32(cwi + 4);
+
+        if (channels == 1)
+        {
+            for (size_t row = 0; row < dsize.height; ++row)
+            {
+                float32x4_t vrw = vdupq_n_f32(coeff[row]);
+
+                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
+                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
+                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);
+
+                internal::prefetch(srow0 + x + 2 * srcStride);
+                internal::prefetch(srow1 + x + 2 * srcStride);
+
+                uint8x8_t vres = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x),
+                                                  vlutl, vluth,
+                                                  vrw, vcw0, vcw1);
+                vst1_u8(drow + col, vres);
+            }
+        }
+        else if (channels == 3)
+        {
+            for (size_t row = 0; row < dsize.height; ++row)
+            {
+                float32x4_t vrw = vdupq_n_f32(coeff[row]);
+
+                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
+                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
+                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);
+
+                internal::prefetch(srow0 + x + 2 * srcStride);
+                internal::prefetch(srow1 + x + 2 * srcStride);
+
+                uint8x16x3_t v_src1 = vld3q_u8(srow0 + (x * 3));
+                uint8x16x3_t v_src2 = vld3q_u8(srow1 + (x * 3));
+
+                uint8x8x3_t v_dst;
+
+                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
+                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
+                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);
+
+                vst3_u8(drow + (col * 3), v_dst);
+            }
+        }
+        else if (channels == 4)
+        {
+            for (size_t row = 0; row < dsize.height; ++row)
+            {
+                float32x4_t vrw = vdupq_n_f32(coeff[row]);
+
+                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
+                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
+                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);
+
+                internal::prefetch(srow0 + x + 2 * srcStride);
+                internal::prefetch(srow1 + x + 2 * srcStride);
+
+                uint8x16x4_t v_src1 = vld4q_u8(srow0 + (x << 2));
+                uint8x16x4_t v_src2 = vld4q_u8(srow1 + (x << 2));
+
+                uint8x8x4_t v_dst;
+
+                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
+                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
+                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);
+                v_dst.val[3] = resizeLinearStep(v_src1.val[3], v_src2.val[3], vlutl, vluth, vrw, vcw0, vcw1);
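+
+                // Annotation: vld4q_u8/vst4_u8 keep the channels de-interleaved, so the
+                // per-column lutl/luth indices and cwi weights computed once above apply
+                // unchanged to every channel plane.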
+ vst4_u8(drow + (col << 2), v_dst); + } + } + } + + if (col < dsize.width) + { + col = dsize.width - 8; + goto downsample_bilinear_8uc1_col_loop8; + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/saturate_cast.hpp b/3rdparty/carotene/src/saturate_cast.hpp new file mode 100644 index 0000000000..98f8545009 --- /dev/null +++ b/3rdparty/carotene/src/saturate_cast.hpp @@ -0,0 +1,199 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SATURATE_CAST_HPP
+#define CAROTENE_SATURATE_CAST_HPP
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+
+#if defined _MSC_VER && defined _M_ARM
+# include <intrin.h>
+#endif
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS { namespace internal {
+
+#if defined _MSC_VER && defined _M_ARM
+
+__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
+{
+    (void)d;
+    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
+    __emit(0x0B40);
+    __emit(0xEE10); // vmov r0, s0
+    __emit(0x0A10);
+    __emit(0x4770); // bx lr
+}
+
+# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
+# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
+
+#elif defined CV_ICC || defined __GNUC__
+
+# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
+# define CAROTENE_ROUND_FLT(value) { \
+    register union { f32 f; s32 i; } result; \
+    asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
+    return result.i; }
+# define CAROTENE_ROUND_DBL(value) { \
+    register union { f32 f; s32 i; } __tegra_result; \
+    asm ( \
+        "ftosid %0, %P1\n" \
+        : "=w" (__tegra_result.f) \
+        : "w" (value) \
+    ); \
+    return __tegra_result.i; \
+    }
+# else
+# define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
+# define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
+# endif
+
+#endif
+
+inline s32 round(f32 value)
+{
+#ifdef CAROTENE_ROUND_FLT
+    CAROTENE_ROUND_FLT(value)
+#else
+    // fallback: round half to even, matching the hardware rounding above
+    s32 intpart = (s32)(value);
+    f32 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+
+inline s32 round(f64 value)
+{
+#ifdef CAROTENE_ROUND_DBL
+    CAROTENE_ROUND_DBL(value)
+#else
+    s32 intpart = (s32)(value);
+    f64 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+template<typename _Tp> inline _Tp saturate_cast(u8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
+
+template<> inline u8 saturate_cast<u8>(s8 v)  { return (u8)std::max((s32)v, 0); }
+template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
+template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
+template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
+
+template<> inline s8 saturate_cast<s8>(u8 v)  { return (s8)std::min((s32)v, SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
+template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
+template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
+
+template<> inline u16 saturate_cast<u16>(s8 v)  { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
+template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
+
+template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
+template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
+
+template<> inline u32 saturate_cast<u32>(s8 v)  { return (u32)std::max(v, (s8)0); }
+template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
+template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
+template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
+template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
+//OpenCV-like f32/f64 -> u32 conversion:
+//we intentionally do not clip negative numbers, so that -1 becomes 0xffffffff etc.
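+//e.g. saturate_cast<u32>(-1.0f) rounds to -1 and wraps to 0xffffffff; the clipping variant below would return 0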
+template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
+template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
+//Negative clipping implementation:
+//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
+//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
+
+template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
+template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
+template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
+
+template<> inline u64 saturate_cast<u64>(s8 v)  { return (u64)std::max(v, (s8)0); }
+template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
+template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
+template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
+
+template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
+
+} } // namespace internal, namespace CAROTENE_NS
+
+#endif // CAROTENE_SATURATE_CAST_HPP
diff --git a/3rdparty/carotene/src/scharr.cpp b/3rdparty/carotene/src/scharr.cpp
new file mode 100644
index 0000000000..2c4ba29742
--- /dev/null
+++ b/3rdparty/carotene/src/scharr.cpp
@@ -0,0 +1,219 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return (dx == 0 && dy == 1 &&
+            isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
+           (dx == 1 && dy == 0 &&
+            isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
+}
+
+void Scharr3x3(const Size2D &size,
+               const u8 * srcBase, ptrdiff_t srcStride,
+               s16 * dstBase, ptrdiff_t dstStride,
+               s32 dx, s32 dy,
+               BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    static s16 dw[] = {3, 10, 3}; // Scharr smoothing kernel
+
+    if (dy == 1)
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           3, 1, dw, 0,
+                           border, borderValue, borderMargin);
+    else
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           1, 3, 0, dw,
+                           border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+void ScharrDeriv(const Size2D &size, s32 cn,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t colsn = size.width*cn;
+    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
+
+    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
+    std::vector<s16> _tempBuf((delta << 1) + 64);
+    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
+
+    int16x8_t vc3 = vmovq_n_s16(3);
+    int16x8_t vc10 = vmovq_n_s16(10);
+    uint8x8_t v8c10 = vmov_n_u8(10);
+
+    for(size_t y = 0; y < size.height; y++ )
+    {
+        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
+        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // do vertical convolution: trow0 = 3*(srow0 + srow2) + 10*srow1, trow1 = srow2 - srow0
+        size_t x = 0;
+        for( ; x < roiw8; x += 8 )
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+#if __GNUC_MINOR__ < 7
+            __asm__ (
+                "vld1.8 {d0}, [%[src0]]              \n\t"
+                "vld1.8 {d2}, [%[src2]]              \n\t"
+                "vld1.8 {d1}, [%[src1]]              \n\t"
+                "vaddl.u8 q2, d2, d0                 \n\t"
+                "vmull.u8 q3, d1, %[vc10]            \n\t"
+                "vsubl.u8 q4, d2, d0                 \n\t"
+                "vmla.s16 q3, q2, %q[vc3]            \n\t"
+                "vst1.16 {d8-d9}, [%[out1],:128]     \n\t"
+                "vst1.16 {d6-d7}, [%[out0],:128]     \n\t"
+                :
+                : [out0] "r" (trow0 + x),
+                  [out1] "r" (trow1 + x),
+                  [src0] "r" (srow0 + x),
+                  [src1] "r" (srow1 + x),
+                  [src2] "r" (srow2 + x),
+                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
+                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
+            );
+#else
+            uint8x8_t s0 = vld1_u8(srow0 + x);
+            uint8x8_t s1 = vld1_u8(srow1 + x);
+            uint8x8_t s2 = vld1_u8(srow2 + x);
+
+            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
+            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
+            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
+            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
+
+            vst1q_s16(trow1 + x, t1);
+            vst1q_s16(trow0 + x, t0);
+#endif
+        }
+        for( ; x < colsn; x++ )
+        {
+            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
+            trow1[x] = (s16)(srow2[x] - srow0[x]);
+        }
+
+        // make border
+        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ?
(size.width-2)*cn : 0); + for( s32 k = 0; k < cn; k++ ) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; + for( ; x < roiw8; x += 8 ) + { +#if __GNUC_MINOR__ < 6 + __asm__ ( + "vld1.16 {d4-d5}, [%[s2ptr]] \n\t" + "vld1.16 {d8-d9}, [%[s4ptr]] \n\t" + "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t" + "vld1.16 {d0-d1}, [%[s0ptr]] \n\t" + "vld1.16 {d2-d3}, [%[s1ptr]] \n\t" + "vadd.i16 q7, q2, q4 \n\t" + "vmul.s16 q6, q3, %q[vc10] \n\t" + "vsub.s16 q5, q1, q0 \n\t" + "vmla.s16 q6, q7, %q[vc3] \n\t" + "vst2.16 {d10-d13}, [%[out]] \n\t" + : + : [out] "r" (drow + x * 2), + [s0ptr] "r" (trow0 + x - cn), + [s1ptr] "r" (trow0 + x + cn), + [s2ptr] "r" (trow1 + x - cn), + [s3ptr] "r" (trow1 + x), + [s4ptr] "r" (trow1 + x + cn), + [vc10] "w" (vc10), [vc3] "w" (vc3) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); +#else + int16x8_t s0 = vld1q_s16(trow0 + x - cn); + int16x8_t s1 = vld1q_s16(trow0 + x + cn); + int16x8_t s2 = vld1q_s16(trow1 + x - cn); + int16x8_t s3 = vld1q_s16(trow1 + x); + int16x8_t s4 = vld1q_s16(trow1 + x + cn); + + int16x8_t s3x10 = vmulq_s16(s3, vc10); + int16x8_t s24 = vaddq_s16(s2, s4); + + int16x8x2_t vr; + vr.val[0] = vsubq_s16(s1, s0); + vr.val[1] = vmlaq_s16(s3x10, s24, vc3); + + vst2q_s16(drow + x*2, vr); +#endif //__GNUC_MINOR__ < 6 + } + for( ; x < colsn; x++ ) + { + drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]); + drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10); + } + } +#else + (void)size; + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/separable_filter.cpp b/3rdparty/carotene/src/separable_filter.cpp new file mode 100644 index 0000000000..a06172c4e6 --- /dev/null +++ b/3rdparty/carotene/src/separable_filter.cpp @@ -0,0 +1,109 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "separable_filter.hpp"
+
+namespace CAROTENE_NS {
+
+bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return isSupportedConfiguration() &&
+           size.width >= 9 && size.height >= 1 &&
+           (size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
+           (dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
+           (border == BORDER_MODE_CONSTANT   ||
+            border == BORDER_MODE_REFLECT    ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE );
+}
+
+void SeparableFilter3x3(const Size2D &size,
+                        const u8 * srcBase, ptrdiff_t srcStride,
+                        s16 * dstBase, ptrdiff_t dstStride,
+                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
+                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
+#ifdef CAROTENE_NEON
+    if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
+        std::abort(); // the generic filter cannot be called without explicit weights
+
+    typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
+                                            const s16*, const s16*, BORDER_MODE, u8, Margin);
+
+    // filter index: 0 -> smoothing (1 2 1), 1 -> first derivative (-1 0 1),
+    //               2 -> second derivative (1 -2 1), 3 -> generic weights
+    static sepFilter3x3_8u16s_func quickFilters[4][4]=
+    {
+        /*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_121>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_121>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_121>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
+
+        /*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_m101>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_m101>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_m101>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
+
+        /*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_1m21>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_1m21>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_1m21>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
+
+        /*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16Generic>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16Generic>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16Generic>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
+    };
+
+    quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
+                                       xw, yw, border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)xw;
+    (void)yw;
+    (void)borderValue;
+#endif
+}
+
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/separable_filter.hpp b/3rdparty/carotene/src/separable_filter.hpp
new file mode 100644
index 0000000000..b0f7307fa0
--- /dev/null
+++ b/3rdparty/carotene/src/separable_filter.hpp
@@ -0,0 +1,1161 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_SEPARABLE_FILTER_HPP
+#define CAROTENE_SRC_SEPARABLE_FILTER_HPP
+
+#include "common.hpp"
+
+#include <carotene/types.hpp>
+
+#include <vector>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS {
+
+namespace internal {
+
+struct RowFilter3x3S16Base
+{
+    typedef u8 srcType;
+    /*
+      Various border types, image boundaries are denoted with '|'
+
+      * BORDER_REPLICATE:   aaaaaa|abcdefgh|hhhhhhh
+      * BORDER_REFLECT:     fedcba|abcdefgh|hgfedcb
+      * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
+      * BORDER_WRAP:        cdefgh|abcdefgh|abcdefg
+      * BORDER_CONSTANT:    iiiiii|abcdefgh|iiiiiii  with some specified 'i'
+    */
+    inline RowFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue, const ptrdiff_t borderxl, const ptrdiff_t borderxr):
+        borderType(_borderType), borderValue(_borderValue)
+    {
+        if (borderType == BORDER_MODE_CONSTANT)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x00ffFFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0xFF07060504030201ULL : 0x0706050403020100ULL));
+        }
+        else if (borderType == BORDER_MODE_REFLECT101)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0001FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0607060504030201ULL : 0x0706050403020100ULL));
+        }
+        else //if (borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ?
0x0707060504030201ULL : 0x0706050403020100ULL)); + } + lookLeft = offsetk - borderxl; + lookRight = offsetk - borderxr; + } + + uint8x8_t vfmask; + uint8x8_t vtmask; + enum { offsetk = 1}; + ptrdiff_t lookLeft; + ptrdiff_t lookRight; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct ColFilter3x3S16Base +{ + typedef s16 srcType; + + inline ColFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue): + borderType(_borderType),borderValue(_borderValue) {} + + enum { offsetk = 1}; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct RowFilter3x3S16Generic : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16Generic(BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16 *w): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter( (w[0]+w[1]+w[2]) * borderValue ) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l)), vw2))); + } + if (i < width - 8 + lookRight) + { + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t l0 = vreinterpretq_s16_u16(vmovl_u8(tail0)); + int16x8_t l1 = vreinterpretq_s16_u16(vmovl_u8(tail1)); + int16x8_t l2 = vreinterpretq_s16_u16(vmovl_u8(tail2)); + + int16x8_t l0w = vmulq_s16(l0, vw0); + int16x8_t l2w = vmulq_s16(l2, vw2); + int16x8_t ls = vaddq_s16(vmlaq_s16(l0w, l1, vw1), l2w); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_m101 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = 
vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vreinterpretq_s16_u16(vsubl_u8(l, vext_u8(l2, l, 6)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + + int16x8_t ls = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_121 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(borderValue << 2) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqaddq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_1m21 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - 
lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqsubq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct ColFilter3x3S16Generic : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16Generic(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *w): + ColFilter3x3S16Base(_borderType, _borderValue) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + vst1q_s16(dst0 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j + 8), vw2), line1, vw0), line2, vw1)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + int16x8_t vwl1 = vw0; + int16x8_t vwl2 = 
vw2; + if (src2 == 0) + { + src2 = src0; + vwl1 = vw2; + vwl2 = vw0; + } + + int16x8_t v_border = vdupq_n_s16(0); + if (borderType == BORDER_MODE_CONSTANT) + { + v_border = vmulq_s16(vdupq_n_s16(borderValue), vwl1); + vwl1 = vw1; + } + else if (borderType == BORDER_MODE_REFLECT101) + { + vwl1 = vw1; + vwl2 = vaddq_s16(vw0, vw2); + } + else //replicate\reflect + vwl1 = vaddq_s16(vwl1, vw1); + + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + vst1q_s16(dst + j + 8, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j + 8), vwl1), + vmulq_s16(vld1q_s16(src2 + j + 8), vwl2))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + vst1q_s16(dst + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), + vld1q_s16(src1 + j + 8), vw1), + vld1q_s16(src2 + j + 8), vw2)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + } + } + } +}; + +struct ColFilter3x3S16_m101 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + vst1q_s16(dst0 + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + vst1q_s16(dst1 + j + 8, vqsubq_s16(vld1q_s16(src3 + j + 8), vld1q_s16(src1 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + if (src0 == 0) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), v_border)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + j += 8; + } + if (j != width) + { + j = width - 8; + 
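+                        // step back and redo the last full vector; the overlapping store covers the ragged tail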
vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(v_border, vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + } + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + int16x8_t vzero = vmovq_n_s16(0); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vzero); + vst1q_s16(dst + j + 8, vzero); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vzero); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vzero); + } + } + else //replicate\reflect + { + if (src0 == 0) src0 = src1; else src2 = src1; + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } +}; + +struct ColFilter3x3S16_121 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + 
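+            // line1 + line2 is shared by both output rows of the 1-2-1 column kernel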
int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + vst1q_s16(dst + j + 8, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + } + } + } +}; + +struct ColFilter3x3U8_121 : public ColFilter3x3S16Base +{ + typedef u8 dstType; + + inline ColFilter3x3U8_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, const srcType* src3, dstType* dst0, dstType* dst1, ptrdiff_t width) + 
{ + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j + 8, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j + 8), line1), l12), 4)); + vst1_u8(dst1 + j + 8, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + } + } + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, dstType* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + //Store normalized result, essential for gaussianBlur + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + + line1 = 
vld1q_s16(src1 + j + 8); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + } + } + } +}; + +struct ColFilter3x3S16_1m21 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);// 1 + //int16x8_t line1 = vld1q_s16(src1 + j);//-1 1 + //int16x8_t line2 = vld1q_s16(src2 + j);// -1 -1 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j + 8), line2), l12)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, 
vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)), vshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                }
+            }
+            else if (borderType == BORDER_MODE_REFLECT101)
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    vst1q_s16(dst + j + 8, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)), 1));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                }
+            }
+            else //replicate\reflect
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                }
+            }
+        }
+        else
+        {
+            ptrdiff_t j = 0;
+            for (; j <= width - 16; j += 16)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)),
+                                                  vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+            }
+            if (j <= width - 8)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                j += 8;
+            }
+            if (j != width)
+            {
+                j = width - 8;
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+            }
+        }
+    }
+};
+
+template <typename RowFilter, typename ColFilter> struct sepFilter3x3
+{
+    typedef typename RowFilter::srcType srcType;
+    typedef typename RowFilter::dstType tmpType;
+    typedef typename ColFilter::dstType dstType;
+
+    static void process(const Size2D &ssize,
+                        const srcType * srcBase, ptrdiff_t srcStride,
+                        dstType * dstBase, ptrdiff_t dstStride,
+                        const s16 *xw, const s16 *yw,
+                        BORDER_MODE borderType, srcType borderValue, Margin borderMargin)
+    {
+        const ptrdiff_t offsetk = 1;
+        ptrdiff_t borderxl, borderxr, borderyt, borderyb;
+        borderxl = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left);
+        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
+        borderxr = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right);
+        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
+
+        std::vector<tmpType> _buf(ssize.width << 2); // ring buffer of 4 row-filtered lines
+        tmpType * buf = &_buf[0];
+
+        RowFilter filterX(borderType, borderValue, borderxl, borderxr, xw);
+        ColFilter filterY(borderType, filterX.borderFilter, yw);
+        const ptrdiff_t lookTop = offsetk - borderyt;
+        const ptrdiff_t lookBottom = offsetk - borderyb;
+
+        const srcType* src = srcBase - lookTop * srcStride / sizeof(srcType);
+        dstType* dst = dstBase;
+
+        ptrdiff_t ridx = -lookTop;
+        for (; ridx <= (ptrdiff_t)ssize.height + lookBottom - 2; ridx += 2)
+ lookBottom - 2; ridx += 2) + { + for (ptrdiff_t bidx = 0; bidx < 2; ++bidx, src += srcStride / sizeof(srcType)) + filterX(src, buf + ssize.width * ((4 + ridx + bidx) % 4), ssize.width); + + if (ridx <= 0) + { + if (ridx == 0) //first row + { + filterY(0, buf + ssize.width * ((ridx + 4) % 4), buf + ssize.width * ((ridx + 1) % 4), dst, ssize.width); + dst += dstStride / sizeof(dstType); + } + continue; + } + + filterY(buf + ssize.width * ((ridx + 2) % 4), + buf + ssize.width * ((ridx + 3) % 4), + buf + ssize.width * ((ridx + 4) % 4), + buf + ssize.width * ((ridx + 1) % 4), + dst, dst + dstStride / sizeof(dstType), ssize.width); + + dst += dstStride * 2 / sizeof(dstType); + } + + if (ridx < (ptrdiff_t)ssize.height + lookBottom) + { + filterX(src, buf + ssize.width * ((4 + ridx) % 4), ssize.width); + filterY(buf + ssize.width * ((2 + ridx) % 4), + buf + ssize.width * ((3 + ridx) % 4), + buf + ssize.width * ((4 + ridx) % 4), dst, ssize.width); + dst += dstStride / sizeof(dstType); + ridx++; + } + if (lookBottom == 0) + filterY(buf + ssize.width * ((ridx + 2) % 4), buf + ssize.width * ((ridx + 3) % 4), 0, dst, ssize.width); + } +}; + +} //namespace internal + +} //namespace CAROTENE_NS + +#endif // CAROTENE_NEON + +#endif // CAROTENE_SRC_REMAP_HPP diff --git a/3rdparty/carotene/src/sobel.cpp b/3rdparty/carotene/src/sobel.cpp new file mode 100644 index 0000000000..5d46045d9f --- /dev/null +++ b/3rdparty/carotene/src/sobel.cpp @@ -0,0 +1,317 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include <vector> + +#include "common.hpp" + +namespace CAROTENE_NS { + +bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border, + s32 dx, s32 dy, Margin borderMargin) +{ + return dx < 3 && dx >= 0 && + dy < 3 && dy >= 0 && + (dx + dy) > 0 && + isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin); +} + +void Sobel3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin)); +#ifdef CAROTENE_NEON + SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride, + dx, dy, 0, 0, + borderType, borderValue, borderMargin); +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border, + s32 dx, s32 dy) +{ + return isSupportedConfiguration() && + dx < 3 && dx >= 0 && + dy < 3 && dy >= 0 && + (dx + dy) > 0 && + size.width >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REPLICATE ); +} + +void Sobel3x3(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, f32 borderValue) +{ + internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy)); +#ifdef CAROTENE_NEON + std::vector<f32> _tmp; + f32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(size.width + 2, borderValue); + tmp = &_tmp[1]; + } + + ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32); // align size + std::vector<f32> _tempBuf((delta << 1) + 64); + f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32); + + for( size_t y = 0; y < size.height; y++ ) + { + const f32* srow0; + const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y); + const f32* srow2; + f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0); + f32* drow1 = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + } + + float32x4_t tprev = vmovq_n_f32(0.f); + float32x4_t tcurr = vmovq_n_f32(0.f); + float32x4_t tnext = vmovq_n_f32(0.f); + float32x4_t t0, t1, t2; + // do vertical convolution + size_t x = 0, bcolsn = y + 2 < size.height ?
size.width : (size.width - 4); + for( ; x <= bcolsn; x += 4 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + float32x4_t x0 = vld1q_f32(srow0 + x); + float32x4_t x1 = vld1q_f32(srow1 + x); + float32x4_t x2 = vld1q_f32(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + if(!dy) + { + tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0); + } + else if(dy == 2) + { + tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0)); + } + else + { + tnext = vsubq_f32(x2, x0); + } + + if(!x) { + tcurr = tnext; + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_f32(borderValue,tcurr, 3); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3); + } + else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3); + } + continue; + } + + internal::prefetch(trow0 + x); + internal::prefetch(trow1 + x); + + t0 = vextq_f32(tprev, tcurr, 3); + t1 = tcurr; + t2 = vextq_f32(tcurr, tnext, 1); + if(!dx) + { + t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2)); + } + else if(dx == 2) + { + t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0)); + } + else + { + t0 = vsubq_f32(t2, t0); + } + + if(!(y%2)) + { + vst1q_f32(trow0 + x - 4, t0); + } + else + { + vst1q_f32(trow1 + x - 4, t0); + } + } + x -= 4; + if(x == size.width){ + x--; + } + f32 prevx = 0, rowx = 0, nextx = 0; + if(!dy) + { + prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 4*borderValue : + srow2[0] + 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] + 2*srow1[x] + srow0[x]; + } + else if(dy == 2) + { + prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 0.f : + srow2[0] - 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] - 2*srow1[x] + srow0[x]; + } + else + { + prevx = x > 0 ? srow2[x-1] - srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 
0.f : + srow2[0] - srow0[0]) ); + rowx = srow2[x] - srow0[x]; + } + + for( ; x < size.width; x++ ) + { + if(x+1 == size.width) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + if(!dy) { + nextx = 4*borderValue; + } else { + nextx = 0.f; + } + } else if (borderType == BORDER_MODE_REFLECT101) + { + if(!dy) { + nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1]; + } else if(dy == 2) { + nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1]; + } else { + nextx = srow2[x-1] - srow0[x-1]; + } + } else { + if(!dy) { + nextx = srow2[x] + 2*srow1[x] + srow0[x]; + } else if(dy == 2) { + nextx = srow2[x] - 2*srow1[x] + srow0[x]; + } else { + nextx = srow2[x] - srow0[x]; + } + } + } else { + if(!dy) { + nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1]; + } else if(dy == 2) { + nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1]; + } else { + nextx = srow2[x+1] - srow0[x+1]; + } + } + f32 res; + if(dx==1) { + res = nextx - prevx; + } else if(!dx) { + res = prevx + 2*rowx + nextx; + } else { + res = prevx - 2*rowx + nextx; + } + if(!(y%2)) { + *(trow0+x) = res; + } else { + *(trow1+x) = res; + } + prevx = rowx; + rowx = nextx; + } + + if(y>0) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(y%2) + *(drow + x1) = trow0[x1]; + else + *(drow + x1) = trow1[x1]; + } + } + if(y == size.height-1) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(!(y%2)) + *(drow1 + x1) = trow0[x1]; + else + *(drow1 + x1) = trow1[x1]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/sub.cpp b/3rdparty/carotene/src/sub.cpp new file mode 100644 index 0000000000..38853895e7 --- /dev/null +++ b/3rdparty/carotene/src/sub.cpp @@ -0,0 +1,621 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T, typename WT> +struct SubWrap +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vsubq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vsub(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = (T)((WT)src0[0] - (WT)src1[0]); + } +}; + +template <typename T, typename WT> +struct SubSaturate +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vqsubq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vqsub(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]); + } +}; + +} // namespace + +#endif + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<u8, s16>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<u8, s16>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10))); + vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11))); + vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1)); + } + + for (; j < size.width; j++) + dst[j] = (s16)src0[j] - (s16)src1[j]; + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10))); + int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + + vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11))); + vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + + vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + + int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1)); + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) ))); + } + for(; j < size.width; j++) + dst[j] = (f32)src0[j] - (f32)src1[j]; + } +#else 
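+ // Note: when CAROTENE_NEON is not defined, the assertSupportedConfiguration() call above is expected to reject the invocation, so this branch only casts the parameters to void to silence unused-parameter warnings.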
+ (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (policy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vqsubq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]); + } + else + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vsubq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = (s16)((s32)src0[j] - (s32)src1[j]); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (policy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1))); + int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1))); + int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + j); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j))); + int16x8_t v_dst = vqsubq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]); + } + else + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1))); + int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1))); + int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + j); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j))); + int16x8_t v_dst = vsubq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = (s16)((s32)src0[j] - (s32)src1[j]); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<s8, s16>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<s8, s16>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<s16, s32>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<s16, s32>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; +
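+ // CONVERT_POLICY example for this s16 overload: -32000 - 2000 = -34000 does not fit in s16, + // so CONVERT_POLICY_WRAP stores -34000 + 65536 = 31536 while CONVERT_POLICY_SATURATE clamps to the s16 minimum, -32768.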
(void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<u16, s32>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<u16, s32>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<s32, s64>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<s32, s64>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubSaturate<u32, s64>()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<u32, s64>()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void sub(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + SubWrap<f32, f32>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/sum.cpp b/3rdparty/carotene/src/sum.cpp new file mode 100644 index 0000000000..812e7fca67 --- /dev/null +++ b/3rdparty/carotene/src/sum.cpp @@ -0,0 +1,385 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +bool isSumSupported(u32 channels) +{ + return (channels && channels < 5); +} + +void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + uint32x4_t vs1231 = vdupq_n_u32(0); + uint32x4_t vs3123 = vdupq_n_u32(0); + uint32x4_t vs2312 = vdupq_n_u32(0); + for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3) + { + internal::prefetch(src + j + 24); + s1 = vaddw_u8(s1, vld1_u8(src + j + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + j + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + j + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); + vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + if (i <= width - 8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3) + { + internal::prefetch(src + 24); + s1 = vaddw_u8(s1, vld1_u8(src + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); 
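+ // The accumulator names track which colour channel occupies each lane: an 8-lane load advances the 3-channel phase by two per vector, + // so low(s1)/high(s2) carry channels 1,2,3,1, low(s2)/high(s3) carry 3,1,2,3, and low(s3)/high(s1) carry 2,3,1,2; + // the stores below untangle the per-channel totals by summing every third element.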
+ vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + + u32 sum[12]; + vst1q_u32(sum+0, vs1231); + vst1q_u32(sum+4, vs2312); + vst1q_u32(sum+8, vs3123); + + for (; i < width; i += 3, src += 3) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + } + + sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9]; + sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10]; + sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11]; + } + else + { + uint32x4_t vs = vdupq_n_u32(0); + for (; i <= width - 257*8; i += 257*8, src += 257 * 8) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for (int j = 8; j < 257 * 8; j += 8) + { + internal::prefetch(src + j); + s1 = vaddw_u8(s1, vld1_u8(src + j)); + } + + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + if (i < width - 7) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for(i+=8,src+=8; i < width-7; i+=8,src+=8) + { + internal::prefetch(src); + s1 = vaddw_u8(s1, vld1_u8(src)); + } + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + + if (channels == 1) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2)); + + u32 s0 = vget_lane_u32(vs1, 0); + for(; i < width; ++i,++src) + s0 += src[0]; + sumdst[0] += s0; + } + else if (channels == 4) + { + vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst))); + + for(; i < width; i+=4,src+=4) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + sumdst[3] += src[3]; + } + } + else//if (channels == 2) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst))); + + for(; i < width; i+=2,src+=2) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + float32x4_t vs1231 = vdupq_n_f32(0); + float32x4_t vs2312 = vdupq_n_f32(0); + float32x4_t vs3123 = vdupq_n_f32(0); + for(; i <= width-12; i += 12) + { + internal::prefetch(src + i + 12); + vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0)); + vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4)); + vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8)); + } + + f32 s[12]; + vst1q_f32(s + 0, vs1231); + vst1q_f32(s + 4, vs2312); + vst1q_f32(s + 8, vs3123); + + sumdst[0] += s[0] + s[3] + s[6] + s[9]; + sumdst[1] += s[1] + s[4] + s[7] + s[10]; + sumdst[2] += s[2] + s[5] + s[8] + s[11]; + for( ; i < width; i+=3) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + sumdst[2] += src[i+2]; + } + } + else + { + float32x4_t vs = vdupq_n_f32(0); + for(; i <= width-4; i += 4) + { + internal::prefetch(src + i); + vs = vaddq_f32(vs, vld1q_f32(src+i)); + } + + if (channels == 1) + { + float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0] + s[1]; + for( ; i < width; i++) + sumdst[0] += src[i]; + } + else if (channels == 4) + { + f32 s[4]; + 
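+ // With 4 channels each lane of the accumulator already corresponds to exactly one channel, so a single store yields the per-channel totals.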
vst1q_f32(s, vs); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + sumdst[2] += s[2]; + sumdst[3] += s[3]; + } + else//if (channels == 2) + { + float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + + if(i < width) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +bool isSqsumSupported(u32 channels) +{ + return (channels && ((4/channels)*channels == 4)); +} + +void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSqsumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width*channels)) + { + size.width *= size.height; + size.height = 1; + } + const size_t width = size.width * channels; + + size_t blockSize0 = 1 << 23; + size_t roiw8 = width & ~7; + + uint32x4_t v_zero = vdupq_n_u32(0u); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw8) + { + size_t blockSize = std::min(roiw8 - j, blockSize0) + j; + uint32x4_t v_sum = v_zero; + uint32x4_t v_sqsum = v_zero; + + for ( ; j < blockSize ; j += 8, src += 8) + { + internal::prefetch(src); + uint8x8_t v_src0 = vld1_u8(src); + + uint16x8_t v_src = vmovl_u8(v_src0); + uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src); + v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi)); + v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo); + v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi); + } + + u32 arsum[8]; + vst1q_u32(arsum, v_sum); + vst1q_u32(arsum + 4, v_sqsum); + + sumdst[0] += (f64)arsum[0]; + sumdst[1 % channels] += (f64)arsum[1]; + sumdst[2 % channels] += (f64)arsum[2]; + sumdst[3 % channels] += (f64)arsum[3]; + sqsumdst[0] += (f64)arsum[4]; + sqsumdst[1 % channels] += (f64)arsum[5]; + sqsumdst[2 % channels] += (f64)arsum[6]; + sqsumdst[3 % channels] += (f64)arsum[7]; + } + // collect a few last elements in the current row + // it's ok to process channels elements per step + // since we could handle 1,2 or 4 channels + // we always have channels-fold amount of elements remaining + for ( ; j < width; j+=channels, src+=channels) + { + for (u32 kk = 0; kk < channels; kk++) + { + u32 srcval = src[kk]; + sumdst[kk] += srcval; + sqsumdst[kk] += srcval * srcval; + } + } + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)sqsumdst; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/template_matching.cpp b/3rdparty/carotene/src/template_matching.cpp new file mode 100644 index 0000000000..ad87085188 --- /dev/null +++ b/3rdparty/carotene/src/template_matching.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <vector> +#include <cstring> +#include <cmath> + +namespace CAROTENE_NS { + +#define ENABLE4LINESMATCHING false // Disabled since overall time for simultaneous 4 lines matching is greater than + // time for simultaneous 2 lines matching for the same amount of data + +bool isMatchTemplateSupported(const Size2D &tmplSize) +{ + return isSupportedConfiguration() && + tmplSize.width >= 8 && // Actually the function could process even shorter templates + // but there will be no NEON optimization in this case + (tmplSize.width * tmplSize.height) <= 256; +} + +void matchTemplate(const Size2D &srcSize, + const u8 * srcBase, ptrdiff_t srcStride, + const Size2D &tmplSize, + const u8 * tmplBase, ptrdiff_t tmplStride, + f32 * dstBase, ptrdiff_t dstStride, + bool normalize) +{ + internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize)); +#ifdef CAROTENE_NEON + const size_t tmplW = tmplSize.width; + const size_t tmplH = tmplSize.height; + const size_t dstW = srcSize.width - tmplSize.width + 1; + const size_t dstH = srcSize.height - tmplSize.height + 1; + + // template correlation part + { +#if ENABLE4LINESMATCHING + const size_t dstroiw4 = dstW & ~3u; +#endif + const size_t dstroiw2 = dstW & ~1u; + const size_t tmplroiw = tmplW & ~7u; + const size_t dstride = dstStride >> 2; + + f32 *corr = dstBase; + const u8 *imgrrow = srcBase; + for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride) + { + size_t c = 0; +#if ENABLE4LINESMATCHING + for(; c < dstroiw4; c+=4) + { + u32 dot[4] = {0, 0, 0, 0}; + uint32x4_t vdot0 = vmovq_n_u32(0); + uint32x4_t vdot1 = vmovq_n_u32(0); + uint32x4_t vdot2 = vmovq_n_u32(0); + uint32x4_t vdot3 = vmovq_n_u32(0); + + const u8 *img = imgrrow; + const u8 *tmpl = tmplBase; + for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride) + { + size_t j = 0; + for(; j < tmplroiw; j+=8) + { + uint8x8_t vtmpl = vld1_u8(tmpl + j); + + uint8x8_t vimg0 = vld1_u8(img + j + c + 0); + uint8x8_t vimg1 = vld1_u8(img + j + c + 1); + uint8x8_t vimg2 = vld1_u8(img + j + c
+ 2); + uint8x8_t vimg3 = vld1_u8(img + j + c + 3); + + uint16x8_t vd0 = vmull_u8(vtmpl, vimg0); + uint16x8_t vd1 = vmull_u8(vtmpl, vimg1); + uint16x8_t vd2 = vmull_u8(vtmpl, vimg2); + uint16x8_t vd3 = vmull_u8(vtmpl, vimg3); + + vdot0 = vpadalq_u16(vdot0, vd0); + vdot1 = vpadalq_u16(vdot1, vd1); + vdot2 = vpadalq_u16(vdot2, vd2); + vdot3 = vpadalq_u16(vdot3, vd3); + } + for(; j < tmplW; ++j) + { + dot[0] += tmpl[j] * img[j + c + 0]; + dot[1] += tmpl[j] * img[j + c + 1]; + dot[2] += tmpl[j] * img[j + c + 2]; + dot[3] += tmpl[j] * img[j + c + 3]; + } + } + uint32x4_t vdotx = vld1q_u32(dot); + uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0)); + uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1)); + uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2)); + uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3)); + uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1); + uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3); + + vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23)))); + } +#endif + + for(; c < dstroiw2; c+=2) + { + u32 dot[2] = {0, 0}; + uint32x4_t vdot0 = vmovq_n_u32(0); + uint32x4_t vdot1 = vmovq_n_u32(0); + const u8 *img = imgrrow; + const u8 *tmpl = tmplBase; + for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride) + { + size_t j = 0; + for(; j < tmplroiw; j+=8) + { + uint8x8_t vtmpl = vld1_u8(tmpl + j); + + uint8x8_t vimg0 = vld1_u8(img + j + c + 0); + uint8x8_t vimg1 = vld1_u8(img + j + c + 1); + + uint16x8_t vd0 = vmull_u8(vtmpl, vimg0); + uint16x8_t vd1 = vmull_u8(vtmpl, vimg1); + + vdot0 = vpadalq_u16(vdot0, vd0); + vdot1 = vpadalq_u16(vdot1, vd1); + } + for(; j < tmplW; ++j) + { + dot[0] += tmpl[j] * img[j + c + 0]; + dot[1] += tmpl[j] * img[j + c + 1]; + } + } + uint32x2_t vdotx = vld1_u32(dot); + uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0)); + uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1)); + uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1); + vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_))); + } + + for(; c < dstW; ++c) + { + u32 dot = 0; + uint32x4_t vdot = vmovq_n_u32(0); + const u8 *img = imgrrow; + const u8 *tmpl = tmplBase; + for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride) + { + size_t j = 0; + for(; j < tmplroiw; j+=8) + { + uint8x8_t vtmpl = vld1_u8(tmpl + j); + uint8x8_t vimg = vld1_u8(img + j + c); + uint16x8_t vd = vmull_u8(vtmpl, vimg); + vdot = vpadalq_u16(vdot, vd); + } + for(; j < tmplW; ++j) + dot += tmpl[j] * img[j + c]; + } + u32 wdot[2]; + vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot))); + dot += wdot[0] + wdot[1]; + corr[c] = (f32)dot; + } + } + } + + if(normalize) + { + f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride)); + + size_t iw = srcSize.width+1; + size_t ih = srcSize.height+1; + std::vector<f64> _sqsum(iw*ih); + f64 *sqsum = &_sqsum[0]; + memset(sqsum, 0, iw*sizeof(f64)); + for(size_t i = 1; i < ih; ++i) + sqsum[iw*i] = 0.; + sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64)); + + for(size_t i = 0; i < dstH; ++i) + { + f32 *result = internal::getRowPtr(dstBase, dstStride, i); + for(size_t j = 0; j < dstW; ++j) + { + double s2 = sqsum[iw*i + j] + + sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] - + sqsum[iw*(i + tmplSize.height) + j] - + sqsum[iw*i + j + tmplSize.width]; + + result[j] /= tn * std::sqrt(s2); + } + } + } +#else + (void)srcSize; + (void)srcBase; + (void)srcStride; + (void)tmplBase; +
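+ // tmplSize is deliberately not cast to void here: it is still consumed by the isMatchTemplateSupported() check above even when CAROTENE_NEON is off.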
(void)tmplStride; + (void)dstBase; + (void)dstStride; + (void)normalize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/threshold.cpp b/3rdparty/carotene/src/threshold.cpp new file mode 100644 index 0000000000..8e03798b02 --- /dev/null +++ b/3rdparty/carotene/src/threshold.cpp @@ -0,0 +1,1627 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + vst1q_u8(dst + j, r0); + vst1q_u8(dst + j + 16, r1); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + vst1_u8(dst + j, r0); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbslq_u8(r0, vtrue_value, vfalse_value); + uint8x16_t r1a = vbslq_u8(r1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbsl_u8(r0, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, r0a); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + uint8x16_t v_lower = vdupq_n_u8(lowerThreshold), v_upper = vdupq_n_u8(upperThreshold); + uint8x8_t v_lower8 = vdup_n_u8(lowerThreshold), v_upper8 = vdup_n_u8(upperThreshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + v_dst0 = vbslq_u8(v_dst0, vtrue_value, vfalse_value); + v_dst1 = vbslq_u8(v_dst1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + v_dst = vbsl_u8(v_dst, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)lowerThreshold; + (void)upperThreshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcleq_u8(v0, vthreshold); + uint8x16_t r1 = vcleq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcle_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vqsubq_u8(v0, vthreshold); + uint8x16_t r1 = vqsubq_u8(v1, vthreshold); + uint8x16_t r0a = vqsubq_u8(v0, r0); + uint8x16_t r1a = vqsubq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vqsub_u8(v0, vthreshold8); + uint8x8_t r0a = vqsub_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(v0, r0); + uint8x16_t r1a = vandq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbicq_u8(v0, r0); + uint8x16_t r1a = vbicq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbic_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcleq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcleq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcle_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s8* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw32; j += 32)
+        {
+            internal::prefetch(src + j);
+            int8x16_t v0 = vld1q_s8(src + j);
+            int8x16_t v1 = vld1q_s8(src + j + 16);
+            // vmin implements truncation directly; the saturating-subtract
+            // identity used in the u8 version does not hold for signed input
+            int8x16_t r0 = vminq_s8(v0, vthreshold);
+            int8x16_t r1 = vminq_s8(v1, vthreshold);
+            vst1q_s8(dst + j, r0);
+            vst1q_s8(dst + j + 16, r1);
+        }
+        for (; j < roiw8; j += 8)
+        {
+            int8x8_t v0 = vld1_s8(src + j);
+            int8x8_t r0 = vmin_s8(v0, vthreshold8);
+            vst1_s8(dst + j, r0);
+        }
+        for (; j < size.width; j++)
+        {
+            *(dst + j) = *(src + j) > threshold ? threshold : *(src + j);
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)threshold;
+#endif
+}
+
+void thresholdToZero(const Size2D &size,
+                     const s8 *srcBase, ptrdiff_t srcStride,
+                     s8 *dstBase, ptrdiff_t dstStride,
+                     s8 threshold)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    int8x16_t vthreshold = vdupq_n_s8(threshold);
+    int8x8_t vthreshold8 = vdup_n_s8(threshold);
+    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s8* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        for (; j < roiw32; j += 32)
+        {
+            internal::prefetch(src + j);
+            int8x16_t v0 = vld1q_s8(src + j);
+            int8x16_t v1 = vld1q_s8(src + j + 16);
+            int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold));
+            int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold));
+            int8x16_t r0a = vandq_s8(v0, r0);
+            int8x16_t r1a = vandq_s8(v1, r1);
+            vst1q_s8(dst + j, r0a);
+            vst1q_s8(dst + j + 16, r1a);
+        }
+        for (; j < roiw8; j += 8)
+        {
+            int8x8_t v0 = vld1_s8(src + j);
+            int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8));
+            int8x8_t r0a = vand_s8(v0, r0);
+            vst1_s8(dst + j, r0a);
+        }
+        for (; j < size.width; j++)
+        {
+            *(dst + j) = *(src + j) > threshold ? *(src + j) : 0;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)threshold;
+#endif
+}
+
+void thresholdToZeroInv(const Size2D &size,
+                        const s8 *srcBase, ptrdiff_t srcStride,
+                        s8 *dstBase, ptrdiff_t dstStride,
+                        s8 threshold)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    int8x16_t vthreshold = vdupq_n_s8(threshold);
+    int8x8_t vthreshold8 = vdup_n_s8(threshold);
+    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vbicq_s8(v0, r0); + int8x16_t r1a = vbicq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vbic_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcleq_s16(v0, vthreshold16); + uint16x8_t r1 = vcleq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(r0a)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + int16x8_t r0 = vminq_s16(v0, vthreshold16); + int16x8_t r1 = vminq_s16(v1, vthreshold16); + vst1q_s16(dst + j, r0); + vst1q_s16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vandq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vbicq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcleq_u16(v0, vthreshold16); + uint16x8_t r1 = vcleq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vminq_u16(v0, vthreshold16); + uint16x8_t r1 = vminq_u16(v1, vthreshold16); + vst1q_u16(dst + j, r0); + vst1q_u16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(v0, r0); + uint16x8_t r1a = vandq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(v0, r0); + uint16x8_t r1a = vbicq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcleq_s32(v0, vthreshold8); + uint32x4_t r1 = vcleq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_s32(dst + j, vreinterpretq_s32_u32(r0a)); + vst1q_s32(dst + j + 4, vreinterpretq_s32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + int32x4_t r0 = vminq_s32(v0, vthreshold8); + int32x4_t r1 = vminq_s32(v1, vthreshold8); + vst1q_s32(dst + j, r0); + vst1q_s32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
*(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcleq_f32(v0, vthreshold8); + uint32x4_t r1 = vcleq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(r0a)); + vst1q_f32(dst + j + 4, vreinterpretq_f32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + float32x4_t r0 = vminq_f32(v0, vthreshold8); + float32x4_t r1 = vminq_f32(v1, vthreshold8); + vst1q_f32(dst + j, r0); + vst1q_f32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/vtransform.hpp b/3rdparty/carotene/src/vtransform.hpp new file mode 100644 index 0000000000..08841a2263 --- /dev/null +++ b/3rdparty/carotene/src/vtransform.hpp @@ -0,0 +1,689 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SRC_VTRANSFORM_HPP
+#define CAROTENE_SRC_VTRANSFORM_HPP
+
+#include "common.hpp"
+
+#include <carotene/types.hpp>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS { namespace internal {
+
+////////////////////////////// Type Traits ///////////////////////
+
+// Maps an element type and channel count to the matching NEON vector types.
+template <typename T, int cn = 1>
+struct VecTraits;
+
+template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits<u16, 1> unsign; };
+template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits<u16, 1> unsign; };
+template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits<u32, 1> unsign; };
+template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits<u32, 1> unsign; };
+template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits<u64, 1> unsign; };
+template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits<u64, 1> unsign; };
+template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits<u32, 1> unsign; };
+
+template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits<u16, 2> unsign; };
+template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits<u16, 2> unsign; };
+template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits<u32, 2> unsign; };
+template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits<u32, 2> unsign; };
+template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits<u64, 2> unsign; };
+template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits<u64, 2> unsign; };
+template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits<u32, 2> unsign; };
+
+template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits<u16, 3> unsign; };
+template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits<u16, 3> unsign; };
+template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits<u32, 3> unsign; };
+template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits<u32, 3> unsign; };
+template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits<u64, 3> unsign; };
+template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef VecTraits<u64, 3> unsign; };
+template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits<u32, 3> unsign; };
+
+template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 4> unsign; };
+template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 4> unsign; };
+template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits<u16, 4> unsign; };
+template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits<u16, 4> unsign; };
+template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits<u32, 4> unsign; };
+template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits<u32, 4> unsign; };
+template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits<u64, 4> unsign; };
+template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits<u64, 4> unsign; };
+template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits<u32, 4> unsign; };
+
+////////////////////////////// vld1q ///////////////////////
+
+inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
+inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
+inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
+inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
+inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
+inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
+inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
+
+////////////////////////////// vld1 ///////////////////////
+
+inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
+inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
+inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
+inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
+inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
+inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
+inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
+
+////////////////////////////// vld2q ///////////////////////
+
+inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
+inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
+inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
+inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
+inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
+inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
+inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
+
+////////////////////////////// vld2 ///////////////////////
+
+inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
+inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
+inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
+inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
+inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
+inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
+inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
+
+////////////////////////////// vld3q ///////////////////////
+
+inline
uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); } +inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); } +inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); } +inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); } +inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); } +inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); } +inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); } + +////////////////////////////// vld3 /////////////////////// + +inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); } +inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); } +inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); } +inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); } +inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); } +inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); } +inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); } + +////////////////////////////// vld4q /////////////////////// + +inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); } +inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); } +inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); } +inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); } +inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); } +inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); } +inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); } + +////////////////////////////// vld4 /////////////////////// + +inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); } +inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); } +inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); } +inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); } +inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); } +inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); } +inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); } + +////////////////////////////// vst1q /////////////////////// + +inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); } +inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); } +inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); } +inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); } +inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); } +inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); } +inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); } + +////////////////////////////// vst1 /////////////////////// + +inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); } +inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); } +inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); } +inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); } +inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); } +inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); } +inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); } + +////////////////////////////// vst2q /////////////////////// + +inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); } +inline void vst2q(s8 * ptr, const 
int8x16x2_t & v) { return vst2q_s8(ptr, v); } +inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); } +inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); } +inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); } +inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); } +inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); } + +////////////////////////////// vst2 /////////////////////// + +inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); } +inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); } +inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); } +inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); } +inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); } +inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); } +inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); } + +////////////////////////////// vst3q /////////////////////// + +inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); } +inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); } +inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); } +inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); } +inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); } +inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); } +inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); } + +////////////////////////////// vst3 /////////////////////// + +inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); } +inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); } +inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); } +inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); } +inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); } +inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); } +inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); } + +////////////////////////////// vst4q /////////////////////// + +inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); } +inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); } +inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); } +inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); } +inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); } +inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); } +inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); } + +////////////////////////////// vst4 /////////////////////// + +inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); } +inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); } +inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); } +inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); } +inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); } +inline void vst4(s32 * ptr, const int32x2x4_t & v) { return 
vst4_s32(ptr, v); } +inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); } + +////////////////////////////// vabdq /////////////////////// + +inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); } +inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); } +inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); } +inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); } +inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); } +inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); } +inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); } + +////////////////////////////// vabd /////////////////////// + +inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); } +inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); } +inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); } +inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); } +inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); } +inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); } +inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); } + +////////////////////////////// vminq /////////////////////// + +inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); } +inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); } +inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); } +inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); } +inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); } +inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); } +inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); } + +////////////////////////////// vmin /////////////////////// + +inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); } +inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); } +inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); } +inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); } +inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); } +inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); } +inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); } + +////////////////////////////// vmaxq /////////////////////// + +inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); } +inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); } +inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); } +inline int16x8_t vmaxq(const int16x8_t & v0, const 
int16x8_t & v1) { return vmaxq_s16(v0, v1); } +inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); } +inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); } +inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); } + +////////////////////////////// vmax /////////////////////// + +inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); } +inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); } +inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); } +inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); } +inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); } +inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); } +inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); } + +////////////////////////////// vdupq_n /////////////////////// + +inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); } +inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); } +inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); } +inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); } +inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); } +inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); } +inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); } +inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); } +inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); } + +////////////////////////////// vdup_n /////////////////////// + +inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); } +inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); } +inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); } +inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); } +inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); } +inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); } +inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); } +inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); } +inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); } + +////////////////////////////// vget_low /////////////////////// + +inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); } +inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); } +inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); } +inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); } +inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); } +inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); } +inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); } + +////////////////////////////// vget_high /////////////////////// + +inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); } +inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); } +inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); } +inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); } +inline uint32x2_t vget_high(const uint32x4_t & v) 
{ return vget_high_u32(v); } +inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); } +inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); } + +////////////////////////////// vcombine /////////////////////// + +inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); } +inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); } +inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); } +inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); } +inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); } +inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); } +inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); } + +////////////////////////////// vaddq /////////////////////// + +inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); } +inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); } +inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); } +inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); } +inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); } +inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); } +inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); } + +////////////////////////////// vadd /////////////////////// + +inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); } +inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); } +inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); } +inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); } +inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); } +inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); } +inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); } + +////////////////////////////// vqaddq /////////////////////// + +inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); } +inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); } +inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); } +inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); } +inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); } +inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); } + +////////////////////////////// vqadd /////////////////////// + +inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); } +inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); } +inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); } +inline 
int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); } +inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); } +inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); } + +////////////////////////////// vsubq /////////////////////// + +inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); } +inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); } +inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); } +inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); } +inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); } +inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); } +inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); } + +////////////////////////////// vsub /////////////////////// + +inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); } +inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); } +inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); } +inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); } +inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); } +inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); } +inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); } + +////////////////////////////// vqsubq /////////////////////// + +inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); } +inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); } +inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); } +inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); } +inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); } +inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); } +inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); } +inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); } + +////////////////////////////// vqsub /////////////////////// + +inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); } +inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); } +inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); } +inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); } +inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); } +inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); } +inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); } +inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); } + 
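[Editorial note — not part of the patch] The wrappers above exist so that a single template body can serve every element type: the unqualified names (vadd, vqadd, vsub, vqsub, ...) resolve through ordinary C++ overloading to the correctly suffixed NEON intrinsic, which is what the vtransform template at the end of this header relies on. A minimal standalone sketch of the pattern, assuming only <arm_neon.h> and an ARM target with NEON enabled; the helper saturatingSum3 and the sample lane values are illustrative, not part of carotene:

#include <arm_neon.h>

// Local re-declarations in the style of the wrappers above: one name,
// overload resolution picks the right suffixed intrinsic per vector type.
inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); }
inline int16x8_t  vqaddq(const int16x8_t  & v0, const int16x8_t  & v1) { return vqaddq_s16(v0, v1); }

// One template body covers every element type -- no per-type copies needed.
template <typename V>
V saturatingSum3(const V & a, const V & b, const V & c)
{
    return vqaddq(vqaddq(a, b), c);
}

int main()
{
    // u8 lanes: 200 + 100 + 1 saturates at 255 instead of wrapping.
    uint8x16_t r8  = saturatingSum3(vdupq_n_u8(200), vdupq_n_u8(100), vdupq_n_u8(1));
    // s16 lanes: 30000 + 10000 + 1 saturates at 32767.
    int16x8_t  r16 = saturatingSum3(vdupq_n_s16(30000), vdupq_n_s16(10000), vdupq_n_s16(1));
    return (vgetq_lane_u8(r8, 0) == 255 && vgetq_lane_s16(r16, 0) == 32767) ? 0 : 1;
}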
+////////////////////////////// vmull /////////////////////// + +inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); } +inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); } +inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); } +inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); } +inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); } +inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); } + +////////////////////////////// vrev64q /////////////////////// + +inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); } +inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); } +inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); } +inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); } +inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); } +inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); } +inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); } + +////////////////////////////// vrev64 /////////////////////// + +inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); } +inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); } +inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); } +inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); } +inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); } +inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); } +inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); } + +////////////////////////////// vceqq /////////////////////// + +inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); } +inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); } +inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); } +inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); } +inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); } +inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); } +inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); } + +////////////////////////////// vceq /////////////////////// + +inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); } +inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); } +inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); } +inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); } +inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); } +inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); } +inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); } + +////////////////////////////// vcgtq /////////////////////// + +inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); } +inline uint8x16_t vcgtq(const int8x16_t 
& v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); } +inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); } +inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); } +inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); } +inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); } +inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); } + +////////////////////////////// vcgt /////////////////////// + +inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); } +inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); } +inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); } +inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); } +inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); } +inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); } +inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); } + +////////////////////////////// vcgeq /////////////////////// + +inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); } +inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); } +inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); } +inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); } +inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); } +inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); } +inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); } + +////////////////////////////// vcge /////////////////////// + +inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); } +inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); } +inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); } +inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); } +inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); } +inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); } +inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); } + +////////////////////////////// vandq /////////////////////// + +inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); } +inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); } +inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); } +inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); } +inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); } +inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); } + +////////////////////////////// vand /////////////////////// + 
+inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); } +inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); } +inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); } +inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); } +inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); } +inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); } + +////////////////////////////// vmovn /////////////////////// + +inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); } +inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); } +inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); } +inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); } +inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); } +inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); } + +////////////////////////////// vqmovn /////////////////////// + +inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); } +inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); } +inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); } +inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); } +inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); } +inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); } + +////////////////////////////// vmovl /////////////////////// + +inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); } +inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); } +inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); } +inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); } + +////////////////////////////// vmvnq /////////////////////// + +inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); } +inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); } +inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); } +inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); } +inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); } +inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); } + +////////////////////////////// vmvn /////////////////////// + +inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); } +inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); } +inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); } +inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); } +inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); } +inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); } + +////////////////////////////// vbicq /////////////////////// + +inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); } +inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); } +inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); } +inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); } +inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); } +inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); } 
+inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); } +inline int64x2_t vbicq(const int64x2_t & v0, const int64x2_t & v1) { return vbicq_s64(v0, v1); } + +////////////////////////////// vbic /////////////////////// + +inline uint8x8_t vbic(const uint8x8_t & v0, const uint8x8_t & v1) { return vbic_u8 (v0, v1); } +inline int8x8_t vbic(const int8x8_t & v0, const int8x8_t & v1) { return vbic_s8 (v0, v1); } +inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); } +inline int16x4_t vbic(const int16x4_t & v0, const int16x4_t & v1) { return vbic_s16(v0, v1); } +inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); } +inline int32x2_t vbic(const int32x2_t & v0, const int32x2_t & v1) { return vbic_s32(v0, v1); } +inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); } +inline int64x1_t vbic(const int64x1_t & v0, const int64x1_t & v1) { return vbic_s64(v0, v1); } + +////////////////////////////// vtransform /////////////////////// + +template <typename Op> +void vtransform(Size2D size, + const typename Op::type * src0Base, ptrdiff_t src0Stride, + const typename Op::type * src1Base, ptrdiff_t src1Stride, + typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op) +{ + typedef typename Op::type type; + typedef typename VecTraits<type>::vec128 vec128; + typedef typename VecTraits<type>::vec64 vec64; + + if (src0Stride == src1Stride && src0Stride == dstStride && + src0Stride == (ptrdiff_t)(size.width * sizeof(type))) + { + size.width *= size.height; + size.height = 1; + } + + const size_t step_base = 32 / sizeof(type); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + const size_t step_tail = 8 / sizeof(type); + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t y = 0; y < size.height; ++y) + { + const type * src0 = internal::getRowPtr(src0Base, src0Stride, y); + const type * src1 = internal::getRowPtr(src1Base, src1Stride, y); + typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y); + size_t x = 0; + + for( ; x < roiw_base; x += step_base ) + { + internal::prefetch(src0 + x); + internal::prefetch(src1 + x); + + vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type)); + vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type)); + vec128 v_dst; + + op(v_src00, v_src10, v_dst); + vst1q(dst + x, v_dst); + + op(v_src01, v_src11, v_dst); + vst1q(dst + x + 16 / sizeof(type), v_dst); + } + for( ; x < roiw_tail; x += step_tail ) + { + vec64 v_src0 = vld1(src0 + x); + vec64 v_src1 = vld1(src1 + x); + vec64 v_dst; + + op(v_src0, v_src1, v_dst); + vst1(dst + x, v_dst); + } + + for (; x < size.width; ++x) + { + op(src0 + x, src1 + x, dst + x); + } + } +} + +} } + +#endif // CAROTENE_NEON + +#endif diff --git a/3rdparty/carotene/src/warp_affine.cpp b/3rdparty/carotene/src/warp_affine.cpp new file mode 100644 index 0000000000..d546efbc10 --- /dev/null +++ b/3rdparty/carotene/src/warp_affine.cpp @@ -0,0 +1,434 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpAffineNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpAffineLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t 
y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, 
v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), 
vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/warp_perspective.cpp b/3rdparty/carotene/src/warp_perspective.cpp new file mode 100644 index 0000000000..4437661413 --- /dev/null +++ b/3rdparty/carotene/src/warp_perspective.cpp @@ -0,0 +1,464 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpPerspectiveLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = 
vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, 
vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, 
v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS
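[Editorial note — not part of the patch] A minimal usage sketch for the warp routines above, assuming the carotene public headers added earlier in this patch (carotene/functions.hpp, carotene/types.hpp) and a Size2D(width, height) constructor from types.hpp; the image contents and matrix values are illustrative. As the scalar tail loops show, m is a column-major inverse map taking destination coordinates to source coordinates: for the affine case src_x = m[0]*x + m[2]*y + m[4] and src_y = m[1]*x + m[3]*y + m[5], while the perspective case uses m[0..8] with the divisor w = m[2]*x + m[5]*y + m[8].

#include <vector>
#include <carotene/functions.hpp>

int main()
{
    using namespace CAROTENE_NS;

    Size2D ssize(128, 128), dsize(128, 128);
    std::vector<u8> src(ssize.width * ssize.height, 128);  // flat gray source image
    std::vector<u8> dst(dsize.width * dsize.height, 0);

    // Column-major 2x3 inverse map: dst(x, y) samples src(0.5*x + 8, 0.5*y + 8),
    // i.e. a 2x zoom of the source region starting at (8, 8).
    f32 m[6] = { 0.5f, 0.0f,    // m[0], m[1]
                 0.0f, 0.5f,    // m[2], m[3]
                 8.0f, 8.0f };  // m[4], m[5]

    if (isWarpAffineLinearSupported(ssize))
        warpAffineLinear(ssize, dsize,
                         src.data(), ssize.width,   // row stride in bytes (1-byte pixels)
                         m,
                         dst.data(), dsize.width,
                         BORDER_MODE_CONSTANT, 0);  // out-of-range pixels become 0
    return 0;
}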