Add Carotene - NVIDIA Hardware-Abstraction-Layer for ARM platforms

This commit is contained in:
Elif Albuz 2016-07-04 23:56:15 -07:00
parent c65d2a0d86
commit 8f91529edf
63 changed files with 39816 additions and 0 deletions

8
3rdparty/carotene/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Gedit temp files
*~
# Qt Creator file
*.user
# MacOS-specific (Desktop Services Store)
.DS_Store

42
3rdparty/carotene/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,42 @@
# Build script for the Carotene library.
# Defines two targets:
#   carotene_objs - OBJECT library holding the compiled sources
#   carotene      - STATIC library assembled from carotene_objs (EXCLUDE_FROM_ALL)
cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
project(Carotene)
# C++ namespace name for Carotene symbols; a cache entry so a parent project can override it.
set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
set(CAROTENE_INCLUDE_DIR include)
set(CAROTENE_SOURCE_DIR src)
# Headers and sources are discovered by glob; the resulting paths are relative to this directory.
file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
"${CAROTENE_SOURCE_DIR}/*.hpp")
include_directories(${CAROTENE_INCLUDE_DIR})
# NOTE(review): this gate tests the C compiler (GNUCC) although the flags below are
# C++ flags - confirm whether CMAKE_COMPILER_IS_GNUCXX was intended.
if(CMAKE_COMPILER_IS_GNUCC)
# Prepend hidden symbol visibility to the C++ flags.
set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
# allow more inlines - these parameters improve performance for:
# - matchTemplate about 5-10%
# - goodFeaturesToTrack 10-20%
# - cornerHarris 30% for some cases
set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
add_library(carotene_objs OBJECT
${carotene_headers}
${carotene_sources}
)
# Only export the namespace definition when it differs from the default,
# which definitions.hpp supplies itself.
if(NOT CAROTENE_NS STREQUAL "carotene")
target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
endif()
# WITH_NEON is expected to be set by the including project; it is only needed
# while compiling the library itself, hence PRIVATE.
if(WITH_NEON)
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
endif()
set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
# Static archive built from the object library; not part of the default "all" target.
add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")

2
3rdparty/carotene/README.md vendored Normal file
View File

@ -0,0 +1,2 @@
This is Carotene, a low-level library containing optimized CPU routines
that are useful for computer vision algorithms.

137
3rdparty/carotene/hal/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,137 @@
# Build script for the Carotene-based HAL (target "tegra_hal").
# This first part detects the target architecture, assembles compiler flags and
# translates build options into preprocessor definitions.
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
# The Carotene library lives one directory above this one.
set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
# Classify the target CPU from CMAKE_SYSTEM_PROCESSOR; at most one of ARM / AARCH64 is set here.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM TRUE)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
set(AARCH64 TRUE)
endif()
# Tegra GPGPU optimization defaults to ON only for Android on 32-bit ARM.
if(ANDROID AND ARM)
set(WITH_TGPU ON CACHE BOOL "Enable Tegra GPGPU optimization")
endif()
# TEGRA_COMPILER_FLAGS is built up as a list and joined into a string further below.
set(TEGRA_COMPILER_FLAGS "")
if(CMAKE_COMPILER_IS_GNUCXX)
# Generate unwind information even for functions that can't throw/propagate exceptions.
# This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
# Scheduling/register tuning flags; the -fsched2-* options are left out for
# X86, ARMEABI_V6 and MIPS with an Android compiler older than 4.6.
if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
else()
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
-fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
endif()
# Graphite loop optimizations: only for clang or compiler >= 4.7, and an NDK release
# that compares (as a string) greater than "r8d".
if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
-floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
endif()
endif()
# Hardware verification defaults to ON for ARM targets and OFF otherwise; the user can override it.
if(ARM OR AARCH64)
set(CHECK_TEGRA_HARDWARE_DEFAULT ON)
else()
set(CHECK_TEGRA_HARDWARE_DEFAULT OFF)
endif()
set(CHECK_TEGRA_HARDWARE ${CHECK_TEGRA_HARDWARE_DEFAULT} CACHE BOOL
"Verify Tegra platform before running optimized code")
# Turn the flag list into a space-separated string and append it to both C and C++ flags.
string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
# Abort configuration when GPU support or hardware detection is requested for an Android API level below 9.
if(ANDROID_NATIVE_API_LEVEL LESS 9 AND (WITH_TGPU OR CHECK_TEGRA_HARDWARE))
message(FATAL_ERROR "GPU support and Hardware detector is not available for API levels below 9.
Please disable Tegra GPU support and hardware detection or configure project for API level 9 or above.")
endif()
# GCC auto-vectorization is switched off for ARMEABI_V7A builds.
if(ARMEABI_V7A)
if (CMAKE_COMPILER_IS_GNUCXX)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
endif()
endif()
# Translate build options into preprocessor definitions for this directory.
if (CHECK_TEGRA_HARDWARE)
add_definitions(-DCHECK_TEGRA_HARDWARE)
endif()
if(WITH_TGPU)
add_definitions(-DHAVE_TGPU)
endif()
if(WITH_LOGS)
add_definitions(-DHAVE_LOGS)
endif()
# Force a dedicated namespace for the Carotene build used by this HAL
# (FORCE overwrites any value already present in the cache).
set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
# compile_carotene()
#
# Adds the Carotene library (target "carotene_objs") as a sub-build and, on
# ARM/AArch64 targets, appends -mfpu=neon to its COMPILE_FLAGS when both the C
# and the C++ compiler accept that flag.
#
# Implemented as a function so that WITH_NEON and
# CMAKE_TRY_COMPILE_CONFIGURATION are set in the function's own variable scope
# and do not leak into the caller.
function(compile_carotene)
  # Carotene's own CMakeLists reads WITH_NEON; derive it from ENABLE_NEON.
  if(ENABLE_NEON)
    set(WITH_NEON ON)
  endif()

  add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")

  if(ARM OR AARCH64)
    # Run the flag checks with the same configuration as the real build.
    if(CMAKE_BUILD_TYPE)
      set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
    endif()

    check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
    check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)

    # Test the variables by name. A failed check stores an empty value, and
    # expanding it unquoted (${VAR}) would leave a malformed condition such as
    # "if( AND 1)".
    if(CXX_HAS_MFPU_NEON AND C_HAS_MFPU_NEON)
      get_target_property(old_flags carotene_objs COMPILE_FLAGS)
      if(old_flags)
        set_target_properties(carotene_objs PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
      else()
        # Property was unset (old_flags is *-NOTFOUND): start a fresh value.
        set_target_properties(carotene_objs PROPERTIES COMPILE_FLAGS "-mfpu=neon")
      endif()
    endif()
  endif()
endfunction()
# Configure Carotene, then build the HAL static library from its objects.
compile_carotene()
include_directories("${CAROTENE_DIR}/include")

# Propagate Carotene's public definitions (e.g. CAROTENE_NS) to this directory.
# get_target_property() yields "<var>-NOTFOUND" when the property is unset;
# guard against appending that placeholder as a bogus definition.
get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
if(carotene_defs)
  set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
endif()

if(CMAKE_COMPILER_IS_GNUCXX)
  # allow more inlines - these parameters improve performance for:
  #   matchTemplate about 5-10%
  #   goodFeaturesToTrack 10-20%
  #   cornerHarris 30% for some cases
  # Only real source files can carry COMPILE_FLAGS. The former
  # $<TARGET_OBJECTS:carotene_objs> argument named already-compiled objects
  # and was not a valid source file name, so it has been dropped.
  set_source_files_properties(impl.cpp COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()

add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})

set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")

# The archive only needs to be installed when OpenCV itself is built statically.
if(NOT BUILD_SHARED_LIBS)
  ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)

# Results reported to the including CMakeLists.
set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)

# Copy the HAL header and the public Carotene headers it needs into
# <build>/carotene so they can be found via CAROTENE_HAL_INCLUDE_DIRS.
configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)

1851
3rdparty/carotene/hal/tegra_hal.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,47 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_DEFINITIONS_HPP
#define CAROTENE_DEFINITIONS_HPP
// CAROTENE_NS names the C++ namespace that wraps the library's declarations.
// A build may predefine it (e.g. with -DCAROTENE_NS=...); otherwise it
// defaults to "carotene".
#ifndef CAROTENE_NS
#define CAROTENE_NS carotene
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,125 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_TYPES_HPP
#define CAROTENE_TYPES_HPP
#include <carotene/definitions.hpp>
#include <stdint.h>
#include <cstddef>
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
namespace CAROTENE_NS {

    using std::size_t;
    using std::ptrdiff_t;

    // Short aliases for the fixed-width arithmetic types used across the library.
    typedef int8_t   s8;
    typedef int16_t  s16;
    typedef int32_t  s32;
    typedef int64_t  s64;

    typedef uint8_t  u8;
    typedef uint16_t u16;
    typedef uint32_t u32;
    typedef uint64_t u64;

    typedef float    f32;
    typedef double   f64;

    // Signed type used for row strides.
    typedef ptrdiff_t stride_t;

    // Conversion policy selector: wrap or saturate.
    enum CONVERT_POLICY
    {
        CONVERT_POLICY_WRAP     = 0,
        CONVERT_POLICY_SATURATE = 1
    };

    // Border handling selector.
    enum BORDER_MODE
    {
        BORDER_MODE_UNDEFINED  = 0,
        BORDER_MODE_CONSTANT   = 1,
        BORDER_MODE_REPLICATE  = 2,
        BORDER_MODE_REFLECT    = 3,
        BORDER_MODE_REFLECT101 = 4,
        BORDER_MODE_WRAP       = 5
    };

    // Flip selector; the two single-axis modes are distinct bits and
    // FLIP_BOTH_MODE is their bitwise OR.
    enum FLIP_MODE
    {
        FLIP_HORIZONTAL_MODE = 1,
        FLIP_VERTICAL_MODE   = 2,
        FLIP_BOTH_MODE       = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
    };

    // Color space selector.
    enum COLOR_SPACE
    {
        COLOR_SPACE_BT601 = 0,
        COLOR_SPACE_BT709 = 1
    };

    // Width/height pair describing a 2D extent.
    struct Size2D {
        size_t width;
        size_t height;

        // Default: an empty 0x0 extent.
        Size2D() : width(0), height(0) {}

        Size2D(size_t width_, size_t height_) : width(width_), height(height_) {}

        // Product of width and height.
        size_t total() const
        {
            return width * height;
        }
    };

    // Four-sided margin description.
    struct Margin {
        // these are measured in elements
        size_t left, right, top, bottom;

        // Default: all four sides are zero.
        Margin() : left(0), right(0), top(0), bottom(0) {}

        Margin(size_t left_, size_t right_, size_t top_, size_t bottom_)
            : left(left_), right(right_), top(top_), bottom(bottom_) {}
    };

    // Abstract sink for keypoints: implementations receive one keypoint per
    // push() call.
    struct KeypointStore {
        virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;

        // Virtual so implementations can be destroyed through this interface.
        virtual ~KeypointStore() {}
    };

}
#endif

241
3rdparty/carotene/src/absdiff.cpp vendored Normal file
View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Functor computing |src0 - src1| for element type T.
// Offers a 128-bit vector form, a 64-bit vector form and a scalar form.
template <typename T>
struct AbsDiff
{
    typedef T type;

    // 128-bit vector form: forwards to internal::vabdq.
    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
                     const typename internal::VecTraits<T>::vec128 & v_src1,
                     typename internal::VecTraits<T>::vec128 & v_dst) const
    {
        v_dst = internal::vabdq(v_src0, v_src1);
    }

    // 64-bit vector form: forwards to internal::vabd.
    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
                     const typename internal::VecTraits<T>::vec64 & v_src1,
                     typename internal::VecTraits<T>::vec64 & v_dst) const
    {
        v_dst = internal::vabd(v_src0, v_src1);
    }

    // Scalar form: always subtracts the smaller operand from the larger one.
    void operator() (const T * src0, const T * src1, T * dst) const
    {
        if (src0[0] >= src1[0])
            dst[0] = src0[0] - src1[0];
        else
            dst[0] = src1[0] - src0[0];
    }
};
template <typename T>
struct AbsDiffSigned
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
v_dst = internal::vqsubq(v_max, v_min);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
v_dst = internal::vqsub(v_max, v_min);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
}
};
} // namespace
#endif
// absDiff for u8 images: runs internal::vtransform over both sources with the
// AbsDiff<u8> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const u8 *src0Base, ptrdiff_t src0Stride,
             const u8 *src1Base, ptrdiff_t src1Stride,
             u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiff<u8>());
#endif
}
// absDiff for u16 images: runs internal::vtransform over both sources with the
// AbsDiff<u16> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const u16 *src0Base, ptrdiff_t src0Stride,
             const u16 *src1Base, ptrdiff_t src1Stride,
             u16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiff<u16>());
#endif
}
// absDiff for s8 images: runs internal::vtransform over both sources with the
// saturating AbsDiffSigned<s8> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const s8 *src0Base, ptrdiff_t src0Stride,
             const s8 *src1Base, ptrdiff_t src1Stride,
             s8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiffSigned<s8>());
#endif
}
// absDiff for s16 images: runs internal::vtransform over both sources with the
// saturating AbsDiffSigned<s16> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const s16 *src0Base, ptrdiff_t src0Stride,
             const s16 *src1Base, ptrdiff_t src1Stride,
             s16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiffSigned<s16>());
#endif
}
// absDiff for s32 images: runs internal::vtransform over both sources with the
// saturating AbsDiffSigned<s32> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const s32 *src0Base, ptrdiff_t src0Stride,
             const s32 *src1Base, ptrdiff_t src1Stride,
             s32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiffSigned<s32>());
#endif
}
// absDiff for f32 images: runs internal::vtransform over both sources with the
// AbsDiff<f32> functor and writes to dst.
// Without CAROTENE_NEON the function only performs the configuration check.
void absDiff(const Size2D &size,
             const f32 * src0Base, ptrdiff_t src0Stride,
             const f32 * src1Base, ptrdiff_t src1Stride,
             f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    // No NEON implementation: reference every parameter to silence warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, AbsDiff<f32>());
#endif
}
} // namespace CAROTENE_NS

408
3rdparty/carotene/src/accumulate.cpp vendored Normal file
View File

@ -0,0 +1,408 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
// Adds an 8-bit source image into a 16-bit signed accumulator with saturation:
//   dst = saturate_s16(dst + src)
// Rows are addressed through internal::getRowPtr using the given strides.
// Without CAROTENE_NEON the function only performs the configuration check.
void accumulate(const Size2D &size,
                const u8 *srcBase, ptrdiff_t srcStride,
                s16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();

#ifndef CAROTENE_NEON
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#else
    const size_t width = size.width;

    // Column limits below which a full 16- or 8-element vector step still fits in the row.
    const size_t limit16 = width >= 15 ? width - 15 : 0;
    const size_t limit8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
        s16 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // 16 pixels per iteration.
        while (col < limit16)
        {
            internal::prefetch(srcRow + col);
            internal::prefetch(dstRow + col);

            const uint8x16_t pixels = vld1q_u8(srcRow + col);
            const int16x8_t accLo = vld1q_s16(dstRow + col);
            const int16x8_t accHi = vld1q_s16(dstRow + col + 8);

            // Widen the u8 pixels to 16 bits; values 0..255 are unchanged by the reinterpret.
            const int16x8_t pixLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t pixHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            vst1q_s16(dstRow + col, vqaddq_s16(accLo, pixLo));
            vst1q_s16(dstRow + col + 8, vqaddq_s16(accHi, pixHi));

            col += 16;
        }

        // 8 pixels per iteration.
        while (col < limit8)
        {
            const int16x8_t pix = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(srcRow + col)));
            const int16x8_t acc = vld1q_s16(dstRow + col);

            vst1q_s16(dstRow + col, vqaddq_s16(acc, pix));

            col += 8;
        }

        // Scalar tail.
        while (col < width)
        {
            dstRow[col] = internal::saturate_cast<s16>(srcRow[col] + dstRow[col]);
            ++col;
        }
    }
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Accumulates the right-shifted square of an 8-bit image into a 16-bit signed
// accumulator with saturation:
//   dst = saturate_s16(dst + ((src * src) >> shift))
// `shift` is a template parameter because vshrq_n_s32 requires a compile-time
// constant shift count. The shift == 0 case has its own specialization.
template <int shift>
void accumulateSquareConst(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
// Column limits below which a full 16- or 8-element vector step still fits in the row.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
// 16 pixels per iteration.
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
// Widen u8 -> 16 bit; values 0..255 are unchanged by the signed reinterpret.
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
// Per 4-lane half: square into 32 bits (vmull), shift right, add the widened
// accumulator (vaddw), then narrow back to s16 with saturation (vqmovn).
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
// 8 pixels per iteration, same arithmetic as above.
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
// Scalar tail for the remaining columns.
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
}
}
}
// Specialization for shift == 0: same computation as the generic template but
// without the right shift, i.e.
//   dst = saturate_s16(dst + src * src)
// NOTE(review): presumably specialized because vshrq_n_s32 does not accept a
// shift count of 0 - confirm against the NEON intrinsics reference.
template <>
void accumulateSquareConst<0>(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
// Column limits below which a full 16- or 8-element vector step still fits in the row.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
// 16 pixels per iteration.
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
// Widen u8 -> 16 bit; values 0..255 are unchanged by the signed reinterpret.
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
// Per 4-lane half: square into 32 bits (vmull), add the widened accumulator
// (vaddw), then narrow back to s16 with saturation (vqmovn).
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
// 8 pixels per iteration, same arithmetic as above.
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
// Scalar tail for the remaining columns.
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
}
}
}
// Pointer type matching every accumulateSquareConst<shift> instantiation; used
// to select an instantiation by a run-time shift value.
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride);
} // namespace
#endif
// Accumulates the right-shifted square of an 8-bit image into a 16-bit signed
// accumulator with saturation:
//   dst = saturate_s16(dst + ((src * src) >> shift))
//
// size                 - image extent
// srcBase / srcStride  - 8-bit source image and its row stride
// dstBase / dstStride  - 16-bit accumulator image (read and written) and its row stride
// shift                - right shift applied to each squared source value
//
// For shift >= 16 the accumulator is left untouched (see below). Otherwise,
// without CAROTENE_NEON, the function only performs the configuration check.
void accumulateSquare(const Size2D &size,
                      const u8 *srcBase, ptrdiff_t srcStride,
                      s16 *dstBase, ptrdiff_t dstStride,
                      u32 shift)
{
    // The source is 8-bit, so src * src <= 255 * 255 = 65025 < 2^16 and
    // (src * src) >> shift is 0 for every shift >= 16. Adding 0 must leave the
    // accumulator unchanged; the previous code zeroed dst here, discarding
    // everything accumulated so far.
    if (shift >= 16)
        return;

    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The dispatch table is needed because the shift intrinsic only accepts a
    // compile-time constant:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    //     return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
    static const accumulateSquareConstFunc funcs[16] =
    {
        accumulateSquareConst<0>,
        accumulateSquareConst<1>,
        accumulateSquareConst<2>,
        accumulateSquareConst<3>,
        accumulateSquareConst<4>,
        accumulateSquareConst<5>,
        accumulateSquareConst<6>,
        accumulateSquareConst<7>,
        accumulateSquareConst<8>,
        accumulateSquareConst<9>,
        accumulateSquareConst<10>,
        accumulateSquareConst<11>,
        accumulateSquareConst<12>,
        accumulateSquareConst<13>,
        accumulateSquareConst<14>,
        accumulateSquareConst<15>
    };

    // shift < 16 is guaranteed by the early return above.
    funcs[shift](size, srcBase, srcStride, dstBase, dstStride);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Functor for the alpha == 0.5 case of weighted accumulation: the truncating
// average (src0 + src1) >> 1 of two u8 inputs.
struct AccumulateWeightedHalf
{
    typedef u8 type;

    // 16 lanes: halving add.
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        v_dst = vhaddq_u8(v_src0, v_src1);
    }

    // 8 lanes: halving add.
    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
                     uint8x8_t & v_dst) const
    {
        v_dst = vhadd_u8(v_src0, v_src1);
    }

    // Scalar: the sum is formed in a wider type so it cannot overflow before halving.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        const int sum = static_cast<int>(src0[0]) + static_cast<int>(src1[0]);
        dst[0] = static_cast<u8>(sum >> 1);
    }
};
// Functor computing the weighted blend
//   dst = beta * src1 + alpha * src0,   with beta = 1 - alpha
// on u8 data, using 32-bit float arithmetic internally.
// The vector forms narrow the result with vmovn (no saturation).
// NOTE(review): this looks safe only if alpha lies in [0, 1] so results stay
// within 0..255 - confirm against callers.
struct AccumulateWeighted
{
typedef u8 type;
// Scalar weights and their 4-lane broadcasts.
float alpha, beta;
float32x4_t v_alpha, v_beta;
explicit AccumulateWeighted(float _alpha) :
alpha(_alpha), beta(1 - _alpha)
{
v_alpha = vdupq_n_f32(alpha);
v_beta = vdupq_n_f32(beta);
}
// 16-lane form: processes the low and the high 8 bytes separately, each as
// two groups of 4 lanes widened u8 -> u16 -> u32 -> f32.
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
// Low 8 bytes.
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
// vmlaq_f32(a, b, c) computes a + b * c, i.e. src1 * beta + alpha * src0.
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
// Convert back to integers and narrow u32 -> u16.
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
// High 8 bytes, same sequence.
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
// Narrow both halves u16 -> u8 and join them.
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
}
// 8-lane form: same computation on a single 8-byte vector.
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
uint8x8_t & v_dst) const
{
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vmovn_u16(_v_dst);
}
// Scalar form: the float result is implicitly converted to u8 on assignment.
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = beta * src1[0] + alpha * src0[0];
}
};
} // namespace
#endif
void accumulateWeighted(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride,
f32 alpha)
{
if (alpha == 0.0f)
return;
if (alpha == 1.0f)
{
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memcpy(dst, src, sizeof(u8) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// in this case we can use the following scheme:
// dst[p] = (src[p] + dst[p]) >> 1
// which is faster
if (alpha == 0.5f)
{
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeightedHalf());
return;
}
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeighted(alpha));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)alpha;
#endif
}
} //namespace CAROTENE_NS

475
3rdparty/carotene/src/add.cpp vendored Normal file
View File

@ -0,0 +1,475 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct AddWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
}
};
template <typename T, typename WT>
struct AddSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
}
};
} // namespace
#endif
/*
 * dst = src0 + src1 for u8 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u8, u16>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u8, u16>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * dst = src0 + src1 for s8 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s8, s16>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s8, s16>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * Widening addition: dst(16-bit) = src0(u8) + src1(u8).
 * The sum of two u8 values always fits into 16 bits, so the conversion
 * policy argument is unnamed and ignored. Destination rows are accessed
 * through a u16 pointer.
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 32-wide and 8-wide vector loops,
    // clamped to 0 so that narrow rows never underflow the unsigned math.
    const size_t limit32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        u16 * out = internal::getRowPtr((u16 *)dstBase, dstStride, row);
        size_t x = 0;

        // 32 pixels per iteration.
        while (x < limit32)
        {
            internal::prefetch(lhs + x);
            internal::prefetch(rhs + x);

            const uint8x16_t lhsLo = vld1q_u8(lhs + x);
            const uint8x16_t lhsHi = vld1q_u8(lhs + x + 16);
            const uint8x16_t rhsLo = vld1q_u8(rhs + x);
            const uint8x16_t rhsHi = vld1q_u8(rhs + x + 16);

            vst1q_u16(out + x,      vaddl_u8(vget_low_u8(lhsLo),  vget_low_u8(rhsLo)));
            vst1q_u16(out + x + 8,  vaddl_u8(vget_high_u8(lhsLo), vget_high_u8(rhsLo)));
            vst1q_u16(out + x + 16, vaddl_u8(vget_low_u8(lhsHi),  vget_low_u8(rhsHi)));
            vst1q_u16(out + x + 24, vaddl_u8(vget_high_u8(lhsHi), vget_high_u8(rhsHi)));

            x += 32;
        }

        // 8 pixels per iteration.
        while (x < limit8)
        {
            const uint8x8_t a = vld1_u8(lhs + x);
            const uint8x8_t b = vld1_u8(rhs + x);
            vst1q_u16(out + x, vaddl_u8(a, b));
            x += 8;
        }

        // Scalar tail.
        for (; x < size.width; ++x)
            out[x] = (u16)lhs[x] + (u16)rhs[x];
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#endif
}
/*
 * Mixed-type addition: dst(s16) = src0(u8) + src1(s16).
 * The u8 operand is zero-extended to 16 bits before the addition.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Exclusive upper bounds for the 16-wide and 8-wide vector loops,
// clamped to 0 so that narrow rows never underflow the unsigned math.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
// Saturating path, 16 pixels per iteration.
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
// Zero-extend u8 -> u16 and reinterpret as s16 (values 0..255 are unchanged).
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
// Saturating path, 8 pixels per iteration.
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
// Saturating path, scalar tail: add in 32 bits, then saturate to s16.
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
}
else
{
// Wrap-around path, 16 pixels per iteration.
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
// Wrap-around path, 8 pixels per iteration.
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
// Wrap-around path, scalar tail: add in 32 bits, then cast down to s16.
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
}
}
#else
// No NEON: silence unused-parameter warnings.
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
/*
 * dst = src0 + src1 for s16 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s16, s32>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s16, s32>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * dst = src0 + src1 for u16 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u16, u32>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u16, u32>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * dst = src0 + src1 for s32 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s32, s64>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s32, s64>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * dst = src0 + src1 for u32 images.
 * `policy` selects saturating addition (CONVERT_POLICY_SATURATE) or
 * wrap-around addition (any other value).
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const u32 * src0Base, ptrdiff_t src0Stride,
         const u32 * src1Base, ptrdiff_t src1Stride,
         u32 * dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrap-around addition.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u32, u64>());
        return;
    }

    // Saturating addition.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u32, u64>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
    (void)policy;
#endif
}
/*
 * dst = src0 + src1 for f32 images.
 * Floating-point addition has no conversion policy, so the plain
 * (non-saturating) functor is always used.
 * Without CAROTENE_NEON the function performs no computation.
 */
void add(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddWrap<f32, f32>());
#endif
}
} // namespace CAROTENE_NS

265
3rdparty/carotene/src/add_weighted.cpp vendored Normal file
View File

@ -0,0 +1,265 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
using namespace internal;
// Compile-time type map used by the weighted-add functors in this file.
// For an element type T it names:
//   wide   - the type with twice the bit width (f64 for f32)
//   narrow - the type with half the bit width (not defined for u8, s8, f32)
//   unsign - the unsigned type of the same width (not defined for f32)
//   vec128 - the 128-bit NEON vector type holding elements of T
// The primary template is intentionally left undefined, so unsupported
// element types fail at compile time.
template <typename T> struct TypeTraits;
template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; };
template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; };
template <> struct TypeTraits<u16> { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; };
template <> struct TypeTraits<s16> { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; };
template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; };
template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; };
template <> struct TypeTraits<f32> { typedef f64 wide; typedef float32x4_t vec128; };
// Weighted-sum functor: dst = alpha*src0 + beta*src1 + gamma, saturated to T.
//
// This primary template does no vector arithmetic itself. It widens its
// inputs to TypeTraits<T>::wide and delegates to the wAdd of that wider type
// (held in `wideAdd`), then narrows the result with a saturating narrow.
// The delegation chain ends at the explicit specializations of wAdd.
template <typename T> struct wAdd
{
typedef T type;
// Scalar copies of the coefficients, used by the single-element overload.
f32 alpha, beta, gamma;
typedef typename TypeTraits<T>::wide wtype;
// Functor for the next wider element type; built with the same coefficients.
wAdd<wtype> wideAdd;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma),
wideAdd(_alpha, _beta, _gamma) {}
// 128-bit form: process the low and high halves separately in the wider
// type, then saturate-narrow both halves and recombine them.
void operator() (const typename VecTraits<T>::vec128 & v_src0,
const typename VecTraits<T>::vec128 & v_src1,
typename VecTraits<T>::vec128 & v_dst) const
{
typename VecTraits<wtype>::vec128 vrl, vrh;
wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
}
// 64-bit form: widen to one 128-bit vector, delegate, saturate-narrow.
void operator() (const typename VecTraits<T>::vec64 & v_src0,
const typename VecTraits<T>::vec64 & v_src1,
typename VecTraits<T>::vec64 & v_dst) const
{
typename VecTraits<wtype>::vec128 vr;
wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
v_dst = vqmovn(vr);
}
// Scalar form: evaluate in floating point and saturate to T.
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
// Weighted-sum functor for s32 elements:
// dst = alpha*src0 + beta*src1 + gamma, computed in single-precision float.
template <> struct wAdd<s32>
{
typedef s32 type;
// Scalar copies of the coefficients, used by the single-element overload.
f32 alpha, beta, gamma;
// Coefficients broadcast to all four float lanes for the vector overloads.
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
// The vector paths fold a +0.5 bias into gamma before the float->int
// conversion below.
// NOTE(review): vcvtq_s32_f32 rounds toward zero per the Arm intrinsics
// reference, so the bias presumably rounds to nearest only for
// non-negative results - confirm behaviour for negative sums.
vgamma = vdupq_n_f32(_gamma + 0.5);
}
// 128-bit form: convert to float, accumulate gamma + alpha*s0 + beta*s1,
// convert back to s32.
void operator() (const typename VecTraits<s32>::vec128 & v_src0,
const typename VecTraits<s32>::vec128 & v_src1,
typename VecTraits<s32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_s32(v_src0);
float32x4_t vs2 = vcvtq_f32_s32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_s32_f32(vs1);
}
// 64-bit form: same computation on two lanes, using the low halves of
// the broadcast coefficient vectors.
void operator() (const typename VecTraits<s32>::vec64 & v_src0,
const typename VecTraits<s32>::vec64 & v_src1,
typename VecTraits<s32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_s32(v_src0);
float32x2_t vs2 = vcvt_f32_s32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_s32_f32(vs1);
}
// Scalar form: uses the unbiased gamma and saturate_cast for the conversion.
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
{
dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
// Weighted-sum functor for u32 elements:
// dst = alpha*src0 + beta*src1 + gamma, computed in single-precision float.
template <> struct wAdd<u32>
{
typedef u32 type;
// Scalar copies of the coefficients, used by the single-element overload.
f32 alpha, beta, gamma;
// Coefficients broadcast to all four float lanes for the vector overloads.
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
// The vector paths fold a +0.5 bias into gamma before the float->int
// conversion below (presumably to round to nearest - confirm).
vgamma = vdupq_n_f32(_gamma + 0.5);
}
// 128-bit form: convert to float, accumulate gamma + alpha*s0 + beta*s1,
// convert back to u32.
void operator() (const typename VecTraits<u32>::vec128 & v_src0,
const typename VecTraits<u32>::vec128 & v_src1,
typename VecTraits<u32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_u32(v_src0);
float32x4_t vs2 = vcvtq_f32_u32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_u32_f32(vs1);
}
// 64-bit form: same computation on two lanes, using the low halves of
// the broadcast coefficient vectors.
void operator() (const typename VecTraits<u32>::vec64 & v_src0,
const typename VecTraits<u32>::vec64 & v_src1,
typename VecTraits<u32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_u32(v_src0);
float32x2_t vs2 = vcvt_f32_u32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_u32_f32(vs1);
}
// Scalar form: uses the unbiased gamma and saturate_cast for the conversion.
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
{
dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
// Weighted-sum functor for f32 elements:
// dst = alpha*src0 + beta*src1 + gamma.
//
// The result stays in floating point, so no rounding bias is applied:
// the vector overloads and the scalar overload use the same gamma and
// therefore agree for every pixel of a row.
template <> struct wAdd<f32>
{
    typedef f32 type;

    // Scalar copies of the coefficients, used by the single-element overload.
    f32 alpha, beta, gamma;
    // Coefficients broadcast to all four float lanes for the vector overloads.
    float32x4_t valpha, vbeta, vgamma;

    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
        alpha(_alpha), beta(_beta), gamma(_gamma)
    {
        valpha = vdupq_n_f32(_alpha);
        vbeta = vdupq_n_f32(_beta);
        // Unlike the integer specializations there is no float->int
        // conversion here, so gamma must NOT carry a +0.5 rounding bias;
        // adding one would offset every vectorized output by 0.5 relative
        // to the scalar tail.
        vgamma = vdupq_n_f32(_gamma);
    }

    // 128-bit form: gamma + alpha*src0 + beta*src1 on four lanes.
    void operator() (const typename VecTraits<f32>::vec128 & v_src0,
                     const typename VecTraits<f32>::vec128 & v_src1,
                     typename VecTraits<f32>::vec128 & v_dst) const
    {
        float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
        v_dst = vmlaq_f32(vs1, v_src1, vbeta);
    }

    // 64-bit form: same computation on two lanes, using the low halves of
    // the broadcast coefficient vectors.
    void operator() (const typename VecTraits<f32>::vec64 & v_src0,
                     const typename VecTraits<f32>::vec64 & v_src1,
                     typename VecTraits<f32>::vec64 & v_dst) const
    {
        float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
        v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
    }

    // Scalar form.
    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
    {
        dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
    }
};
} // namespace
// IMPL_ADDWEIGHTED(type) defines the public overload
//     addWeighted(size, src0, src0Stride, src1, src1Stride, dst, dstStride,
//                 alpha, beta, gamma)
// for one element type. With CAROTENE_NEON it builds a wAdd<type> functor
// from the three coefficients and runs it over the image via
// internal::vtransform.
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride, \
f32 alpha, f32 beta, f32 gamma) \
{ \
internal::assertSupportedConfiguration(); \
wAdd<type> wgtAdd(alpha, \
beta, \
gamma); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
wgtAdd); \
}
#else
// Without CAROTENE_NEON the generated overload only calls
// internal::assertSupportedConfiguration() and ignores its arguments.
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t, \
f32, f32, f32) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// Instantiate addWeighted for every supported element type.
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)
} // namespace CAROTENE_NS

225
3rdparty/carotene/src/bitwise.cpp vendored Normal file
View File

@ -0,0 +1,225 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
// Bitwise AND functor over u8 data, in the three call forms used by the
// vtransform driver: 128-bit vectors, 64-bit vectors and a single element.
struct BitwiseAnd
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = vandq_u8(lhs, rhs);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = vand_u8(lhs, rhs);
    }

    // Single byte.
    void operator() (const u8 * lhs, const u8 * rhs, u8 * result) const
    {
        *result = *lhs & *rhs;
    }
};
// Bitwise OR functor over u8 data, in the three call forms used by the
// vtransform driver: 128-bit vectors, 64-bit vectors and a single element.
struct BitwiseOr
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = vorrq_u8(lhs, rhs);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = vorr_u8(lhs, rhs);
    }

    // Single byte.
    void operator() (const u8 * lhs, const u8 * rhs, u8 * result) const
    {
        *result = *lhs | *rhs;
    }
};
// Bitwise XOR functor over u8 data, in the three call forms used by the
// vtransform driver: 128-bit vectors, 64-bit vectors and a single element.
struct BitwiseXor
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = veorq_u8(lhs, rhs);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = veor_u8(lhs, rhs);
    }

    // Single byte.
    void operator() (const u8 * lhs, const u8 * rhs, u8 * result) const
    {
        *result = *lhs ^ *rhs;
    }
};
#endif
/*
 * dst = ~src for every byte of a u8 image.
 * Rows are processed 32 bytes, then 8 bytes, then 1 byte at a time.
 * Without CAROTENE_NEON the function performs no computation.
 */
void bitwiseNot(const Size2D &size,
                const u8 *srcBase, ptrdiff_t srcStride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 32-wide and 8-wide vector loops,
    // clamped to 0 so that narrow rows never underflow the unsigned math.
    const size_t limit32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8* in = internal::getRowPtr(srcBase, srcStride, row);
        u8* out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        // 32 bytes per iteration; both loads happen before either store.
        while (x < limit32)
        {
            internal::prefetch(in + x);
            const uint8x16_t first = vld1q_u8(in + x);
            const uint8x16_t second = vld1q_u8(in + x + 16);
            const uint8x16_t notFirst = vmvnq_u8(first);
            const uint8x16_t notSecond = vmvnq_u8(second);
            vst1q_u8(out + x, notFirst);
            vst1q_u8(out + x + 16, notSecond);
            x += 32;
        }

        // 8 bytes per iteration.
        while (x < limit8)
        {
            vst1_u8(out + x, vmvn_u8(vld1_u8(in + x)));
            x += 8;
        }

        // Scalar tail.
        for (; x < size.width; ++x)
            out[x] = ~in[x];
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
#endif
}
/*
 * dst = src0 & src1 for every byte of two u8 images.
 * Without CAROTENE_NEON the function performs no computation.
 */
void bitwiseAnd(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         BitwiseAnd());
#endif
}
/*
 * dst = src0 | src1 for every byte of two u8 images.
 * Without CAROTENE_NEON the function performs no computation.
 */
void bitwiseOr(const Size2D &size,
               const u8 *src0Base, ptrdiff_t src0Stride,
               const u8 *src1Base, ptrdiff_t src1Stride,
               u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         BitwiseOr());
#endif
}
/*
 * dst = src0 ^ src1 for every byte of two u8 images.
 * Without CAROTENE_NEON the function performs no computation.
 */
void bitwiseXor(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         BitwiseXor());
#endif
}
} // namespace CAROTENE_NS

1337
3rdparty/carotene/src/blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

773
3rdparty/carotene/src/canny.cpp vendored Normal file
View File

@ -0,0 +1,773 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Horizontal pass of a 3-tap separable filter over one u8 row.
// For every pixel it writes two s16 values:
//   dstx[i] = src[i+1] - src[i-1]               (horizontal difference)
//   dsty[i] = src[i-1] + 2*src[i] + src[i+1]    (horizontal smoothing)
// The constructor flags select how the first and last pixel obtain their
// missing neighbour (see the mask comments below).
struct RowFilter3x3Canny
{
inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
{
// Byte-permutation table for the first 8-byte load. Indices 0xFF are out of
// range for vtbl1_u8 and yield 0; only lanes 6 and 7 matter (left neighbour
// and first pixel). With a left border both lanes pick byte 0 (the first
// pixel is replicated); otherwise they pick bytes 0 and 1 of a load that
// starts one pixel earlier.
vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
// Byte-permutation table for the tail load. With a right border the bytes
// are shifted by one and the last byte is repeated; otherwise the table is
// the identity.
vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
// Number of pixels read outside [0, width) on each side:
// offsetk (1) minus the border flag.
// NOTE(review): this assumes borderxl/borderxr are 0 or 1 - confirm with callers.
lookLeft = offsetk - borderxl;
lookRight = offsetk - borderxr;
}
// Filters `width` pixels of `src` into dstx/dsty.
// NOTE(review): the tail reads src + (width - 9) and stores at width - 8,
// so width is presumably expected to be at least 9 - confirm with callers.
inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
{
// `l` carries the previous 8 source bytes into the next iteration; its
// lanes 6 and 7 provide the left neighbour and the centre of the first pixel.
uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
ptrdiff_t i = 0;
for (; i < width - 8 + lookRight; i += 8)
{
internal::prefetch(src + i);
uint8x8_t l18u = vld1_u8(src + i + 1);
// l2 = right neighbours src[i+1 .. i+8]
uint8x8_t l2 = l18u;
// l0 = left neighbours src[i-1 .. i+6]
uint8x8_t l0 = vext_u8(l, l18u, 6);
// l1x2 = 2 * centre pixels src[i .. i+7]
int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
l = l18u;
int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
int16x8_t ldy = vaddq_s16(l02, l1x2);
vst1q_s16(dstx + i, ldx);
vst1q_s16(dsty + i, ldy);
}
//tail
// Recomputes the last 8 outputs (width-8 .. width-1); these may overlap
// outputs already written by the loop above.
if (lookRight == 0 || i != width)
{
uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
// Centre pixels: byte 7 of tail0 followed by the first 7 bytes of tail2.
uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
vst1q_s16(dstx + (width - 8), taildx);
vst1q_s16(dsty + (width - 8), taildy);
}
}
// Lookup tables for the first load (vfmask) and the tail load (vtmask).
uint8x8_t vfmask;
uint8x8_t vtmask;
// Filter radius: one neighbour on each side.
enum { offsetk = 1};
ptrdiff_t lookLeft;
ptrdiff_t lookRight;
};
// Vertical pass of the 3x3 filter with L1 gradient magnitude.
// src0/src1/src2 are three consecutive rows of intermediate data; each row
// is read at offsets [0, width) and [width, 2*width).
// For every column j it writes:
//   dstx[j] = src0[j] + 2*src1[j] + src2[j]
//   dsty[j] = src2[j + width] - src0[j + width]
//   mag[j]  = |dstx[j]| + |dsty[j]|   (summed in 16 bits, then widened to s32)
// NOTE(review): the tail re-runs the loop body at j = width - 8, so width is
// presumably expected to be at least 8 - confirm with callers.
template <bool L2gradient>
inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
// Re-entry point for the final, possibly overlapping, block of 8 columns.
ColFilter3x3CannyL1Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int16x8_t dya = vabsq_s16(dy);
int16x8_t dxa = vabsq_s16(dx);
int16x8_t norm = vaddq_s16(dya, dxa);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j + 4, normh);
vst1q_s32(mag + j, norml);
}
// If width is not a multiple of 8, process the last 8 columns once more.
// After that pass j becomes width, so the loop and this check both exit.
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL1Loop;
}
}
// Vertical pass of the 3x3 filter with L2 (squared) gradient magnitude.
// Same inputs and dstx/dsty outputs as the generic version; the magnitude is
//   mag[j] = dstx[j]*dstx[j] + dsty[j]*dsty[j]   (accumulated in s32)
// NOTE(review): the tail re-runs the loop body at j = width - 8, so width is
// presumably expected to be at least 8 - confirm with callers.
template <>
inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
// Re-entry point for the final, possibly overlapping, block of 8 columns.
ColFilter3x3CannyL2Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
// Low half starts with dx^2 and high half with dy^2; the other square
// is accumulated below, so both halves end up as dx^2 + dy^2.
int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));
norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j, norml);
vst1q_s32(mag + j + 4, normh);
}
// If width is not a multiple of 8, process the last 8 columns once more.
// After that pass j becomes width, so the loop and this check both exit.
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL2Loop;
}
}
// L1 gradient magnitude for one row: _norm[j] = |_dx[j]| + |_dy[j]|.
//
//   colscn - number of elements in the row
//   _dx    - s16 x-derivatives
//   _dy    - s16 y-derivatives
//   _norm  - s32 output magnitudes
//
// The vector path is software-pipelined: the inputs for block k+1 are loaded
// while block k is being computed, and the last loaded block is finished
// after the loop. Vector lanes form |dx| + |dy| in 16 bits before widening.
// Elements not covered by a full block of 8 are handled by the scalar tail.
template <bool L2gradient>
inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
    ptrdiff_t j = 0;
    if (colscn >= 8)
    {
        int16x8_t vx = vld1q_s16(_dx);
        int16x8_t vy = vld1q_s16(_dy);
        for (; j <= colscn - 16; j+=8)
        {
            // Prefetch relative to the current position (not the row start).
            internal::prefetch(_dx + j);
            internal::prefetch(_dy + j);

            // Load the next block before finishing the current one.
            int16x8_t vx2 = vld1q_s16(_dx + j + 8);
            int16x8_t vy2 = vld1q_s16(_dy + j + 8);

            int16x8_t vabsx = vabsq_s16(vx);
            int16x8_t vabsy = vabsq_s16(vy);

            int16x8_t norm = vaddq_s16(vabsx, vabsy);

            int32x4_t normh = vmovl_s16(vget_high_s16(norm));
            int32x4_t norml = vmovl_s16(vget_low_s16(norm));

            vst1q_s32(_norm + j + 4, normh);
            vst1q_s32(_norm + j + 0, norml);

            vx = vx2;
            vy = vy2;
        }

        // Finish the block that is still held in vx/vy (elements j .. j+7).
        int16x8_t vabsx = vabsq_s16(vx);
        int16x8_t vabsy = vabsq_s16(vy);

        int16x8_t norm = vaddq_s16(vabsx, vabsy);

        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
        int32x4_t norml = vmovl_s16(vget_low_s16(norm));

        vst1q_s32(_norm + j + 4, normh);
        vst1q_s32(_norm + j + 0, norml);

        // These 8 elements are done; do not recompute them in the scalar tail.
        j += 8;
    }
    // Scalar tail for the remaining (fewer than 8) elements.
    for (; j < colscn; j++)
        _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
}
/*
 * L2 (squared) gradient magnitude: _norm[j] = _dx[j]^2 + _dy[j]^2 for j in [0, colscn).
 *
 * colscn  number of elements to process
 * _dx/_dy input derivatives (s16)
 * _norm   output magnitudes (s32), at least colscn elements
 */
template <>
inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
    ptrdiff_t j = 0;
    if (colscn >= 8)
    {
        // Software pipelining: vx/vy always hold elements [j, j + 8).
        int16x8_t vx = vld1q_s16(_dx);
        int16x8_t vy = vld1q_s16(_dy);
        for (; j <= colscn - 16; j += 8)
        {
            internal::prefetch(_dx + j);
            internal::prefetch(_dy + j);

            int16x8_t vxnext = vld1q_s16(_dx + j + 8);
            int16x8_t vynext = vld1q_s16(_dy + j + 8);

            // Widening multiply-accumulate; the sum is commutative, so the
            // x/y order per half does not matter.
            int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
            int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
            norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
            normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));

            vst1q_s32(_norm + j + 0, norml);
            vst1q_s32(_norm + j + 4, normh);

            vx = vxnext;
            vy = vynext;
        }

        // Drain the last preloaded vector.
        int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
        int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
        norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
        normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
        vst1q_s32(_norm + j + 0, norml);
        vst1q_s32(_norm + j + 4, normh);

        // These eight elements are done; do not recompute them in the scalar tail.
        j += 8;
    }
    for (; j < colscn; j++)
        _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
}
/*
 * Converts the floating-point hysteresis thresholds into integers for the
 * L1 magnitude: orders them so that low <= high and rounds each one down.
 */
template <bool L2gradient>
inline void prepareThresh(f64 low_thresh, f64 high_thresh,
                          s32 &low, s32 &high)
{
    if (high_thresh < low_thresh)
        std::swap(low_thresh, high_thresh);

#if defined __GNUC__
    // Truncate, then step down by one where truncation moved the value up.
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    if (low > low_thresh)
        --low;
    if (high > high_thresh)
        --high;
#else
    // Round, then step down by one where rounding moved the value up.
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    const f32 lowRemainder = (f32)(low_thresh - low);
    const f32 highRemainder = (f32)(high_thresh - high);
    if (lowRemainder < 0)
        --low;
    if (highRemainder < 0)
        --high;
#endif
}
/*
 * L2 variant: the magnitude is kept squared, so positive thresholds are
 * squared before being ordered-and-rounded-down like the L1 variant.
 */
template <>
inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
                                s32 &low, s32 &high)
{
    if (high_thresh < low_thresh)
        std::swap(low_thresh, high_thresh);

    // Non-positive thresholds are left as they are.
    if (low_thresh > 0)
        low_thresh = low_thresh * low_thresh;
    if (high_thresh > 0)
        high_thresh = high_thresh * high_thresh;

#if defined __GNUC__
    // Truncate, then step down by one where truncation moved the value up.
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    if (low > low_thresh)
        --low;
    if (high > high_thresh)
        --high;
#else
    // Round, then step down by one where rounding moved the value up.
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    const f32 lowRemainder = (f32)(low_thresh - low);
    const f32 highRemainder = (f32)(high_thresh - high);
    if (lowRemainder < 0)
        --low;
    if (highRemainder < 0)
        --high;
#endif
}
// Produces per-row gradient magnitude and dx/dy for Canny3x3 when the derivatives are
// computed here from the u8 source (generic case of the externalSobel switch).
//
// One allocation (`buffer`) holds three ring-buffer rows followed by the edge map.
// Each ring row is `magstep` s32 elements:
//   [0, mapstep)            magnitudes, with one border element on each side
//   then four s16 planes of size.width elements each, addressed through (s16*)row:
//     dxOffset  - final dx        dyOffset  - final dy
//     shxOffset / shyOffset - the two outputs of the row filter (sobelRow)
template <bool L2gradient, bool externalSobel>
struct _normEstimator
{
ptrdiff_t magstep;
ptrdiff_t dxOffset;
ptrdiff_t dyOffset;
ptrdiff_t shxOffset;
ptrdiff_t shyOffset;
std::vector<u8> buffer;
const ptrdiff_t offsetk;
ptrdiff_t borderyt, borderyb;
RowFilter3x3Canny sobelRow;
// Allocates the buffer, hands back the three ring rows (mag_buf), the map pointer and
// mapstep, and pre-fills the first and last map rows with 1.
// The unnamed s32 parameter (channel count) is not used by this variant.
inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
offsetk(1),
sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
{
mapstep = size.width + 2;
// width+2 magnitudes plus 4*width s16 values expressed in s32 units
magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
// plane offsets are in s16 elements, measured from the start of a ring row
dxOffset = mapstep * sizeof(s32)/sizeof(s16);
dyOffset = dxOffset + size.width * 1;
shxOffset = dxOffset + size.width * 2;
shyOffset = dxOffset + size.width * 3;
buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + magstep;
mag_buf[2] = mag_buf[1] + magstep;
// only the magnitude part of the first ring row is cleared
memset(mag_buf[0], 0, mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + magstep);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
// 0 when the caller's margin provides a row beyond that edge, 1 otherwise
borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
}
// Fills mag_buf[1] with dx, dy and magnitude for image row 0.
// The dx/dy base/stride parameters are unused in this variant.
inline void firstRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
s32** mag_buf)
{
//sobelH row #0
const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
//sobelH row #1
_src = internal::getRowPtr(srcBase, srcStride, 1);
sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
if (borderyt == 0)
{
// a real row above the image is available: use it as the top neighbour
//sobelH row #-1
_src = internal::getRowPtr(srcBase, srcStride, -1);
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
else
{
// no row above: row 0 stands in for its own top neighbour
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
}
// Fills mag_buf[2] for loop index i and returns in _x/_y the dx/dy planes of
// mag_buf[1], the row the caller treats as central.
inline void nextRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
if (i < size.height - borderyb)
{
// source row i+1 is readable: filter it and use it as the bottom neighbour
const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
//sobelH row #i+1
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else if (i < size.height)
{
// no row below: mag_buf[1] is passed as both middle and bottom row
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else
// past the last image row: the row below is all-zero magnitude
memset(mag_buf[2], 0, mapstep*sizeof(s32));
_x = ((s16*)mag_buf[1]) + dxOffset;
_y = ((s16*)mag_buf[1]) + dyOffset;
}
};
// Specialisation used when the caller supplies dx/dy: only the magnitude rows are
// computed here, and for cn > 1 each pixel is reduced to its strongest channel.
// NOTE: the cn > 1 reduction writes into the caller's dx/dy rows in place.
template <bool L2gradient>
struct _normEstimator<L2gradient, true>
{
std::vector<u8> buffer;
// Allocates three magnitude rows of mapstep*cn s32 each, followed by the edge map;
// returns them through mag_buf/map/mapstep and pre-fills the first and last map rows with 1.
inline _normEstimator(const Size2D &size, s32 cn, Margin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
{
mapstep = size.width + 2;
buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + mapstep*cn;
mag_buf[2] = mag_buf[1] + mapstep*cn;
// only the first mapstep entries are cleared (the cn factor is deliberately commented out)
memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + mapstep*cn);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
}
// Computes the magnitude of image row 0 into mag_buf[1] (offset by one border element).
inline void firstRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
s32** mag_buf)
{
s32* _norm = mag_buf[1] + 1;
s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
// Compact the interleaved channels: for pixel j keep the channel with the largest
// magnitude (first one wins on ties) and move its norm/dx/dy to index j.
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
// zero the left and right border magnitudes
_norm[-1] = _norm[size.width] = 0;
}
// Computes the magnitude of row i into the ring buffer and returns in _x/_y the
// caller's dx/dy rows for row i-1.
inline void nextRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
// selects mag_buf[2] whenever i > 0 (mag_buf[1] for i == 0)
s32* _norm = mag_buf[(i > 0) + 1] + 1;
if (i < size.height)
{
s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
// same strongest-channel compaction as in firstRow
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
_norm[-1] = _norm[size.width] = 0;
}
else
// past the last image row: clear the magnitudes including both border elements
memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));
_x = internal::getRowPtr(dxBase, dxStride, i-1);
_y = internal::getRowPtr(dyBase, dyStride, i-1);
}
};
/*
 * Canny edge detector core (3x3 derivatives).
 *
 * L2gradient     selects squared-L2 (true) or L1 (false) gradient magnitude
 * externalSobel  true when dx/dy are supplied by the caller, false when they are
 *                computed from srcBase by the norm estimator
 *
 * size, cn             image size and channel count (cn is forwarded to the estimator)
 * srcBase/srcStride    u8 source (forwarded to the estimator)
 * dstBase/dstStride    u8 output: 255 for edge pixels, 0 otherwise
 * dxBase/dyBase        s16 derivative planes (forwarded to the estimator)
 * low/high_thresh      hysteresis thresholds, converted by prepareThresh
 * borderMargin         forwarded to the estimator
 *
 * Stages: (1) per-row non-maximum suppression writing a 0/1/2 map and seeding a
 * stack with strong edge pixels, (2) hysteresis by flood-filling from the stack,
 * (3) conversion of the map to the output image.
 */
template <bool L2gradient, bool externalSobel>
inline void Canny3x3(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     s16 * dxBase, ptrdiff_t dxStride,
                     s16 * dyBase, ptrdiff_t dyStride,
                     f64 low_thresh, f64 high_thresh,
                     Margin borderMargin)
{
    s32 low, high;
    prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);

    ptrdiff_t mapstep;
    s32* mag_buf[3];
    u8* map;
    _normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);

    size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
    std::vector<u8*> stack( maxsize );
    u8 **stack_top = &stack[0];
    u8 **stack_bottom = &stack[0];

    /* sector numbers
       (Top-Left Origin)

        1   2   3
         *  *  *
          * * *
        0*******0
          * * *
         *  *  *
        3   2   1
    */

    #define CANNY_PUSH(d)    *(d) = u8(2), *stack_top++ = (d)
    #define CANNY_POP(d)     (d) = *--stack_top

    //i == 0
    normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);

    // calculate magnitude and angle of gradient, perform non-maxima supression.
    // fill the map with one of the following values:
    //   0 - the pixel might belong to an edge
    //   1 - the pixel can not belong to an edge
    //   2 - the pixel does belong to an edge
    for (size_t i = 1; i <= size.height; i++)
    {
        const s16 *_x, *_y;
        normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);

        u8* _map = map + mapstep*i + 1;
        _map[-1] = _map[size.width] = 1;

        s32* _mag = mag_buf[1] + 1; // take the central row
        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];

        // Make room for a full row of pushes before processing it.
        if ((stack_top - stack_bottom) + size.width > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            // Growing by 3/2 alone may still be smaller than sz + width (wide, short
            // images), which would let CANNY_PUSH write past the end of the stack.
            maxsize = std::max<size_t>(maxsize * 3/2, (size_t)sz + size.width);
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        s32 prev_flag = 0;
        for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
        {
            #define CANNY_SHIFT 15
            // tan(22.5 deg) in fixed point with CANNY_SHIFT fractional bits
            const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

            s32 m = _mag[j];

            if (m > low)
            {
                s32 xs = _x[j];
                s32 ys = _y[j];
                s32 x = abs(xs);
                s32 y = abs(ys) << CANNY_SHIFT;

                s32 tg22x = x * TG22;

                if (y < tg22x)
                {
                    // sector 0: compare with left/right neighbours
                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
                }
                else
                {
                    s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
                    if (y > tg67x)
                    {
                        // sector 2: compare with the rows above/below
                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
                    }
                    else
                    {
                        // sectors 1/3: compare along the diagonal chosen by the gradient sign
                        s32 s = (xs ^ ys) < 0 ? -1 : 1;
                        if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
                    }
                }
            }
            prev_flag = 0;
            _map[j] = u8(1);
            continue;
        __push:
            // A strong pixel seeds the stack unless its left or upper neighbour already did.
            if (!prev_flag && m > high && _map[j-mapstep] != 2)
            {
                CANNY_PUSH(_map + j);
                prev_flag = 1;
            }
            else
                _map[j] = 0;
        }

        // scroll the ring buffer
        _mag = mag_buf[0];
        mag_buf[0] = mag_buf[1];
        mag_buf[1] = mag_buf[2];
        mag_buf[2] = _mag;
    }

    // now track the edges (hysteresis thresholding)
    while (stack_top > stack_bottom)
    {
        u8* m;
        // each pop can push at most 8 neighbours
        if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            maxsize = maxsize * 3/2;
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        CANNY_POP(m);

        if (!m[-1])         CANNY_PUSH(m - 1);
        if (!m[1])          CANNY_PUSH(m + 1);
        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
    }

    // the final pass, form the final image
    uint8x16_t v2 = vmovq_n_u8(2);
    const u8* ptrmap = map + mapstep + 1;
    for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
    {
        u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
        ptrdiff_t j = 0;
        for (; j < (ptrdiff_t)size.width - 16; j += 16)
        {
            internal::prefetch(ptrmap);
            uint8x16_t vmap = vld1q_u8(ptrmap + j);
            uint8x16_t vdst = vceqq_u8(vmap, v2);   // 0xFF where map == 2
            vst1q_u8(_dst+j, vdst);
        }
        for (; j < (ptrdiff_t)size.width; j++)
            _dst[j] = (u8)-(ptrmap[j] >> 1);        // 2 -> 255, 0/1 -> 0
    }
}
} // namespace
#endif
/*
 * Tells whether Canny3x3L1/L2 (source-image overloads) can process an image
 * of the given size on this build: the configuration must be supported and
 * the image must be at least 9 columns by 2 rows.
 */
bool isCanny3x3Supported(const Size2D &size)
{
    if (!isSupportedConfiguration())
        return false;

    return size.height >= 2 && size.width >= 9;
}
/*
 * Canny edge detection on a u8 image using the L1 gradient magnitude;
 * derivatives are computed internally.
 */
void Canny3x3L1(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh,
                Margin borderMargin)
{
    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
    // One channel, and no caller-provided derivative planes.
    const s32 channels = 1;
    s16 * const noDerivative = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<false, false>(size, channels,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           noDerivative, noStride,
                           noDerivative, noStride,
                           low_thresh, high_thresh,
                           borderMargin);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)low_thresh;
    (void)high_thresh;
    (void)borderMargin;
#endif
}
/*
 * Canny edge detection on a u8 image using the squared-L2 gradient magnitude;
 * derivatives are computed internally.
 */
void Canny3x3L2(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh,
                Margin borderMargin)
{
    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
    // One channel, and no caller-provided derivative planes.
    const s32 channels = 1;
    s16 * const noDerivative = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<true, false>(size, channels,
                          srcBase, srcStride,
                          dstBase, dstStride,
                          noDerivative, noStride,
                          noDerivative, noStride,
                          low_thresh, high_thresh,
                          borderMargin);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)low_thresh;
    (void)high_thresh;
    (void)borderMargin;
#endif
}
/*
 * Canny edge detection from caller-supplied s16 derivatives (cn channels),
 * using the L1 gradient magnitude. No source image is involved.
 */
void Canny3x3L1(const Size2D &size, s32 cn,
                s16 * dxBase, ptrdiff_t dxStride,
                s16 * dyBase, ptrdiff_t dyStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The source-image arguments are unused when derivatives come from the caller.
    const u8 * const noSource = NULL;
    const ptrdiff_t noSourceStride = 0;

    Canny3x3<false, true>(size, cn,
                          noSource, noSourceStride,
                          dstBase, dstStride,
                          dxBase, dxStride,
                          dyBase, dyStride,
                          low_thresh, high_thresh,
                          Margin());
#else
    (void)size;
    (void)cn;
    (void)dstBase;
    (void)dstStride;
    (void)dxBase;
    (void)dxStride;
    (void)dyBase;
    (void)dyStride;
    (void)low_thresh;
    (void)high_thresh;
#endif
}
/*
 * Canny edge detection from caller-supplied s16 derivatives (cn channels),
 * using the squared-L2 gradient magnitude. No source image is involved.
 */
void Canny3x3L2(const Size2D &size, s32 cn,
                s16 * dxBase, ptrdiff_t dxStride,
                s16 * dyBase, ptrdiff_t dyStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The source-image arguments are unused when derivatives come from the caller.
    const u8 * const noSource = NULL;
    const ptrdiff_t noSourceStride = 0;

    Canny3x3<true, true>(size, cn,
                         noSource, noSourceStride,
                         dstBase, dstStride,
                         dxBase, dxStride,
                         dyBase, dyStride,
                         low_thresh, high_thresh,
                         Margin());
#else
    (void)size;
    (void)cn;
    (void)dstBase;
    (void)dstStride;
    (void)dxBase;
    (void)dxStride;
    (void)dyBase;
    (void)dyStride;
    (void)low_thresh;
    (void)high_thresh;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,486 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
/*
 * Copies channel `coi` of a 2-channel interleaved u8 image into a single-channel image.
 *
 * size                 width/height in pixels
 * srcBase/srcStride    interleaved source (2 values per pixel)
 * dstBase/dstStride    single-channel destination
 * coi                  channel to extract, must be 0 or 1; anything else is rejected
 *                      through assertSupportedConfiguration instead of indexing
 *                      past the loaded channel vectors
 */
void extract2(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration(coi < 2u);
#ifdef CAROTENE_NEON
    // Loop bounds: `dj < roiwN` guarantees N more output pixels are available.
#ifndef ANDROID
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;   // source / destination element indices

#ifndef ANDROID
        // 32 pixels per iteration: two de-interleaving 16-pixel loads
        for (; dj < roiw32; sj += 64, dj += 32)
        {
            internal::prefetch(src + sj);

            uint8x16x2_t v_src = vld2q_u8(src + sj);
            vst1q_u8(dst + dj, v_src.val[coi]);

            v_src = vld2q_u8(src + sj + 32);
            vst1q_u8(dst + dj + 16, v_src.val[coi]);
        }
#endif

        // 8 pixels per iteration
        for (; dj < roiw8; sj += 16, dj += 8)
        {
            uint8x8x2_t v_src = vld2_u8(src + sj);
            vst1_u8(dst + dj, v_src.val[coi]);
        }

        // scalar tail
        for (; dj < size.width; sj += 2, ++dj)
        {
            dst[dj] = src[sj + coi];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)coi;
#endif
}
/*
 * Copies channel `coi` of a 3-channel interleaved u8 image into a single-channel image.
 *
 * size                 width/height in pixels
 * srcBase/srcStride    interleaved source (3 values per pixel)
 * dstBase/dstStride    single-channel destination
 * coi                  channel to extract, must be 0..2; anything else is rejected
 *                      through assertSupportedConfiguration instead of indexing
 *                      past the loaded channel vectors
 */
void extract3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration(coi < 3u);
#ifdef CAROTENE_NEON
    // Loop bounds: `dj < roiwN` guarantees N more output pixels are available.
#ifndef ANDROID
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;   // source / destination element indices

#ifndef ANDROID
        // 32 pixels per iteration: two de-interleaving 16-pixel loads
        for (; dj < roiw32; sj += 96, dj += 32)
        {
            internal::prefetch(src + sj);

            uint8x16x3_t v_src = vld3q_u8(src + sj);
            vst1q_u8(dst + dj, v_src.val[coi]);

            v_src = vld3q_u8(src + sj + 48);
            vst1q_u8(dst + dj + 16, v_src.val[coi]);
        }
#endif

        // 8 pixels per iteration
        for (; dj < roiw8; sj += 24, dj += 8)
        {
            uint8x8x3_t v_src = vld3_u8(src + sj);
            vst1_u8(dst + dj, v_src.val[coi]);
        }

        // scalar tail
        for (; dj < size.width; sj += 3, ++dj)
        {
            dst[dj] = src[sj + coi];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)coi;
#endif
}
/*
 * Copies channel `coi` of a 4-channel interleaved u8 image into a single-channel image.
 *
 * size                 width/height in pixels
 * srcBase/srcStride    interleaved source (4 values per pixel)
 * dstBase/dstStride    single-channel destination
 * coi                  channel to extract, must be 0..3; anything else is rejected
 *                      through assertSupportedConfiguration instead of indexing
 *                      past the loaded channel vectors
 */
void extract4(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration(coi < 4u);
#ifdef CAROTENE_NEON
    // Loop bounds: `dj < roiwN` guarantees N more output pixels are available.
#ifndef ANDROID
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;   // source / destination element indices

#ifndef ANDROID
        // 32 pixels per iteration: two de-interleaving 16-pixel loads
        for (; dj < roiw32; sj += 128, dj += 32)
        {
            internal::prefetch(src + sj);

            uint8x16x4_t v_src = vld4q_u8(src + sj);
            vst1q_u8(dst + dj, v_src.val[coi]);

            v_src = vld4q_u8(src + sj + 64);
            vst1q_u8(dst + dj + 16, v_src.val[coi]);
        }
#endif

        // 8 pixels per iteration
        for (; dj < roiw8; sj += 32, dj += 8)
        {
            uint8x8x4_t v_src = vld4_u8(src + sj);
            vst1_u8(dst + dj, v_src.val[coi]);
        }

        // scalar tail
        for (; dj < size.width; sj += 4, ++dj)
        {
            dst[dj] = src[sj + coi];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)coi;
#endif
}
// FILL_LINESn(macro, type) expands macro##_LINE(type, k) once per channel, k = 0 .. n-1.
// Each larger variant reuses the previous one and appends the next channel.
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
// Function-argument line: one destination plane (base pointer + stride), emitted with a
// leading comma so it can follow the source arguments in a parameter list.
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride
#ifdef CAROTENE_NEON
// Per-channel line macros consumed through FILL_LINESn inside SPLIT/SPLIT64.
// They rely on the locals i, sj, dj, src and v_src of the expanding function.
// Row pointer of destination plane n for row i.
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
// Store channel n of a de-interleaved 128-bit / 64-bit load.
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
// Scalar store of channel n.
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];
// Multiply by the channel count; the argument is not parenthesised, so pass simple
// expressions only (all uses here pass integer literals).
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
// Stride-equality prefix of the flattening test in SPLIT/SPLIT64. Each variant ends
// with a dangling `&&` that the use site completes.
#define CONTDST2 srcStride == dst0Stride && \
srcStride == dst1Stride &&
#define CONTDST3 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride &&
#define CONTDST4 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride && \
srcStride == dst3Stride &&
// SPLIT_QUAD(sgn, bits, n): de-interleave one 128-bit group per channel from src + sj
// into dst0..dst{n-1} at dj. Relies on the locals of the function generated by SPLIT.
//
// The hand-written ARM32 NEON path is only for genuine GCC older than 4.7.
// Clang also reports itself as GCC 4.2, and the d-register syntax below is not valid
// on AArch64, so both are excluded and use the intrinsics path instead.
// The asm writes through its pointer operands, hence the "memory" clobber.
#if !defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
        "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
        "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
        "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
        "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
        : \
        : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
          [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
        : "d0","d1","d2","d3","memory" \
    );
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
        "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
        "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
        "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
        "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
        "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
        : \
        : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
          [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
        : "d0","d1","d2","d3","d4","d5","memory" \
    );
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
        "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
        "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
        "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
        "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
        "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
        "vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
        : \
        : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
          [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
        : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
    );
#define SPLIT_QUAD(sgn, bits, n) { \
        internal::prefetch(src + sj); \
        SPLIT_ASM##n(sgn, bits) \
    }
#else
#define SPLIT_QUAD(sgn, bits, n) { \
        internal::prefetch(src + sj); \
        vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
        FILL_LINES##n(VST1Q, sgn##bits) \
    }
#endif // old GCC (< 4.7) on 32-bit ARM only
// SPLIT(sgn, bits, n) defines
//   void split<n>(size, src, srcStride, dst0, dst0Stride, ..., dst{n-1}, dst{n-1}Stride)
// which de-interleaves an n-channel image of sgn##bits elements into n planes.
// Per row: 128-bit groups (SPLIT_QUAD), then at most one 64-bit group, then a scalar tail.
// NOTE(review): the flattening test compares every stride with srcStride and with
// size.width directly (no factor n, no sizeof); it looks like it can only hold for
// degenerate layouts - confirm the intended contiguity condition.
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##bits) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
SPLIT_QUAD(sgn, bits, n) \
\
if (dj < roiw8) \
{ \
vec64 v_src = vld##n##_##sgn##bits(src + sj); \
FILL_LINES##n(VST1, sgn##bits) \
sj += MUL##n(8)/sizeof(sgn##bits); \
dj += 8/sizeof(sgn##bits); \
} \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
FILL_LINES##n(SST, sgn##bits) \
} \
} \
}
// SPLIT64(sgn, n) defines split<n> for 64-bit elements. A 64-bit vector holds a single
// element, so every pixel is handled by one vld<n>/vst1 pair and there is no scalar tail.
// NOTE(review): uses the same flattening test as SPLIT (strides compared with size.width
// unscaled) - confirm the intended contiguity condition.
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
const sgn##64 * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##64) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##64) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
vec64 v_src = vld##n##_##sgn##64(src + sj); \
FILL_LINES##n(VST1, sgn##64) \
} \
} \
}
// ALPHA_QUAD(sgn, bits): load one 128-bit group of 4-channel pixels from src + sj,
// store the first three channels interleaved at dst3 + d3j and the fourth at dst1 + d1j.
// Relies on the locals of the function generated by SPLIT4ALPHA.
//
// The hand-written ARM32 NEON path is only for genuine GCC older than 4.7.
// Clang also reports itself as GCC 4.2, and the d-register syntax below is not valid
// on AArch64, so both are excluded and use the intrinsics path instead.
// The asm writes through its pointer operands, hence the "memory" clobber.
#if !defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define ALPHA_QUAD(sgn, bits) { \
        internal::prefetch(src + sj); \
        __asm__ ( \
            "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
            "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
            "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
            "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
            "vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
            : \
            : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
              [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
            : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
        ); \
    }
#else
/* The union reinterprets the 4-vector load as a 3-vector value (its first three vectors). */
#define ALPHA_QUAD(sgn, bits) { \
        internal::prefetch(src + sj); \
        union { vec128_4 v4; vec128_3 v3; } vals; \
        vals.v4 = vld4q_##sgn##bits(src + sj); \
        vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
        vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
    }
#endif // old GCC (< 4.7) on 32-bit ARM only
// SPLIT4ALPHA(sgn, bits) defines
//   void split4(size, src, srcStride, dst3, dst3Stride, dst1, dst1Stride)
// which splits a 4-channel image into a 3-channel interleaved image (channels 0..2)
// and a single-channel image (channel 3).
// Per row: 128-bit groups (ALPHA_QUAD), then at most one 64-bit group, then a scalar tail.
// The 64-bit step uses the type-generic vst3_/vst1_ intrinsics so the macro is correct
// for any sgn/bits it is instantiated with, not only u8.
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride, \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (srcStride == dst3Stride && \
        srcStride == dst1Stride && \
        srcStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
        size_t sj = 0u, d3j = 0u, d1j = 0u; \
        \
        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
                             d1j += 16/sizeof(sgn##bits)) \
            ALPHA_QUAD(sgn, bits) \
        \
        if (d1j < roiw8) \
        { \
            union { vec64_4 v4; vec64_3 v3; } vals; \
            vals.v4 = vld4_##sgn##bits(src + sj); \
            vst3_##sgn##bits(dst3 + d3j, vals.v3); \
            vst1_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
            sj += MUL4(8)/sizeof(sgn##bits); \
            d3j += MUL3(8)/sizeof(sgn##bits); \
            d1j += 8/sizeof(sgn##bits); \
        } \
        \
        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
        { \
            dst3[d3j+0] = src[sj + 0]; \
            dst3[d3j+1] = src[sj + 1]; \
            dst3[d3j+2] = src[sj + 2]; \
            dst1[d1j]   = src[sj + 3]; \
        } \
    } \
}
#else
// Builds without CAROTENE_NEON: the same split functions are defined as stubs that only
// call assertSupportedConfiguration() and mark every parameter as used.
// Marks destination plane n's base/stride parameters as used.
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
FILL_LINES##n(VOID, sgn##bits) \
}
// The 64-bit stub is the generic stub instantiated with bits = 64.
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
(void)dst3Base; \
(void)dst3Stride; \
(void)dst1Base; \
(void)dst1Stride; \
}
#endif //CAROTENE_NEON
// Instantiate split2/split3/split4 for u8, u16, s32 and s64 elements, plus the
// split4 overload that separates three interleaved channels from the fourth (u8).
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)
SPLIT4ALPHA(u,8)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,389 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
// FILL_LINESn(macro, type) expands macro##_LINE(type, k) once per channel, k = 0 .. n-1.
// Each larger variant reuses the previous one and appends the next channel.
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
// Function-argument line: one source plane (base pointer + stride), emitted with a
// leading comma so it can follow the size argument in a parameter list.
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
#ifdef CAROTENE_NEON
// Per-channel line macros consumed through FILL_LINESn inside COMBINE/COMBINE64.
// They rely on the locals i, sj, dj, dst and v_dst of the expanding function.
// Row pointer of source plane n for row i.
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
// Prefetch source plane n at the current position.
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
// Load plane n into lane-set n of the interleaving store operand (128-bit).
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
// Prefetch and 128-bit load combined.
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
// 64-bit load of plane n.
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
// Scalar copy of plane n into channel n of the interleaved destination.
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
// Multiply by the channel count; the argument is not parenthesised, so pass simple
// expressions only (all uses here pass integer literals).
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
// Stride-equality prefix of the flattening test in COMBINE/COMBINE64. Each variant ends
// with a dangling `&&` that the use site completes.
#define CONTSRC2 dstStride == src0Stride && \
dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride && \
dstStride == src3Stride &&
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define MERGE_ASM2(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3" \
);
#define MERGE_ASM3(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
"vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5" \
);
#define MERGE_ASM4(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
);
// MERGE_QUAD, inline-assembly flavour: one 16-byte-per-plane step of the merge loop.
// Expands the PREF line macro for every source plane (PREF_LINE is defined outside this
// excerpt; presumably a prefetch - confirm) and then runs the MERGE_ASM<n> statement.
#define MERGE_QUAD(sgn, bits, n) { \
FILL_LINES##n(PREF, sgn##bits) \
MERGE_ASM##n(sgn, bits) \
}
#else
// MERGE_QUAD, intrinsics flavour: one 16-byte-per-plane step of the merge loop.
// PRLD_LINE prefetches and loads a q-register vector from every source plane into
// v_dst.val[k]; vst<n>q then stores the n vectors element-interleaved at dst + dj.
// vec128 must be a typedef in the expanding scope (COMBINE provides it).
#define MERGE_QUAD(sgn, bits, n) { \
vec128 v_dst; \
/*FILL_LINES##n(PREF, sgn##bits) \
FILL_LINES##n(VLD1Q, sgn##bits)*/ \
FILL_LINES##n(PRLD, sgn##bits) \
vst##n##q_##sgn##bits(dst + dj, v_dst); \
}
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
// COMBINE(sgn, bits, n): defines the overload
//     void combine<n>(const Size2D &, <n source base/stride pairs>, sgn##bits * dstBase, ptrdiff_t dstStride)
// which interleaves n single-channel planes into one n-channel destination.
// The parameter list for the sources comes from FILL_LINES<n>(FARG, ...) and the per-row
// source pointers from FILL_LINES<n>(VROW, ...); both are defined outside this excerpt.
// Per row the work is split into three stages:
//   1. MERGE_QUAD steps of 16 bytes per plane while sj < roiw16,
//   2. at most one 8-bytes-per-plane step (vld1 + vst<n>) if sj < roiw8,
//   3. an element-by-element copy (SLD_LINE) for whatever is left.
// sj counts source elements, dj counts destination elements (dj advances n per element).
// NOTE(review): the row-merging test compares dstStride, a byte stride, with the source
// strides and with size.width, an element count. For a packed n-channel destination the
// row length is n * width * sizeof(element) bytes, so this condition looks like it can
// only hold for inconsistent strides - confirm the intended contiguity check.
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##bits) \
sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
MERGE_QUAD(sgn, bits, n) \
\
if ( sj < roiw8 ) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##bits) \
vst##n##_##sgn##bits(dst + dj, v_dst); \
sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
} \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
FILL_LINES##n(SLD, sgn##bits) \
} \
} \
}
// COMBINE64(sgn, n): 64-bit element variant of COMBINE. Defines
//     void combine<n>(const Size2D &, <n source base/stride pairs>, sgn##64 * dstBase, ptrdiff_t dstStride)
// One d-register holds exactly one 64-bit element, so every iteration loads one element
// per plane with VLD1_LINE and stores the n of them with vst<n>; there is no separate
// wide or scalar stage.
// NOTE(review): the row-merging test has the same shape as in COMBINE - a byte stride is
// compared with the source strides and with size.width (an element count) - confirm the
// intended contiguity check.
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##64), \
sgn##64 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##64) \
sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##64) \
vst##n##_##sgn##64(dst + dj, v_dst); \
/*FILL_LINES##n(SLD, sgn##64)*/ \
} \
} \
}
#else
// Build without NEON: the combine<n>() overloads keep their signatures but have no
// implementation. assertSupportedConfiguration() is still called first; every argument
// is then cast to void so that no unused-parameter warning is produced.
// VOID_LINE marks the base pointer and stride of source plane n as intentionally unused.
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
FILL_LINES##n(VOID, sgn##bits) \
(void)dstBase; \
(void)dstStride; \
}
// The 64-bit stub needs no special handling, so it reuses the generic stub.
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
#endif //CAROTENE_NEON
// Instantiate combine2/combine3/combine4 for u8, u16, s32 and s64 elements.
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
/*
 * Packs separate Y, U and V planes into interleaved rows laid out as Y0 U Y1 V.
 *
 * For every U/V sample the loops read two consecutive Y bytes and write four output
 * bytes, so size.width is used as the number of U/V samples per row and size.height as
 * the number of rows. Strides are byte distances between consecutive rows.
 * assertSupportedConfiguration() is called first; in a build without CAROTENE_NEON the
 * arguments are otherwise unused.
 */
void combineYUYV(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
    // exclusive bound for starting a step that consumes 32 U/V samples
    const size_t limit32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    // exclusive bound for starting a step that consumes 8 U/V samples
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * const yRow = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * const uRow = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * const vRow = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * const outRow = internal::getRowPtr(dstBase, dstStride, row);

        // c indexes U/V samples; Y is read at 2 * c and output written at 4 * c
        size_t c = 0u;

#ifndef ANDROID
        for (; c < limit32; c += 32)
        {
            const u8 * const y = yRow + 2 * c;
            const u8 * const u = uRow + c;
            const u8 * const v = vRow + c;
            u8 * const out = outRow + 4 * c;

            internal::prefetch(y);
            internal::prefetch(u);
            internal::prefetch(v);

            // first half: 16 U/V samples, 32 Y samples (de-interleaved into even/odd)
            uint8x16x2_t yPair = vld2q_u8(y);
            uint8x16x4_t packed;
            packed.val[0] = yPair.val[0];
            packed.val[1] = vld1q_u8(u);
            packed.val[2] = yPair.val[1];
            packed.val[3] = vld1q_u8(v);
            vst4q_u8(out, packed);

            // second half
            yPair = vld2q_u8(y + 32);
            packed.val[0] = yPair.val[0];
            packed.val[1] = vld1q_u8(u + 16);
            packed.val[2] = yPair.val[1];
            packed.val[3] = vld1q_u8(v + 16);
            vst4q_u8(out + 64, packed);
        }
#endif

        for (; c < limit8; c += 8)
        {
            const uint8x8x2_t yPair = vld2_u8(yRow + 2 * c);
            uint8x8x4_t packed;
            packed.val[0] = yPair.val[0];
            packed.val[1] = vld1_u8(uRow + c);
            packed.val[2] = yPair.val[1];
            packed.val[3] = vld1_u8(vRow + c);
            vst4_u8(outRow + 4 * c, packed);
        }

        // scalar remainder
        for (; c < size.width; ++c)
        {
            u8 * const out = outRow + 4 * c;
            out[0] = yRow[2 * c];
            out[1] = uRow[c];
            out[2] = yRow[2 * c + 1];
            out[3] = vRow[c];
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
/*
 * Packs separate Y, U and V planes into interleaved rows laid out as U Y0 V Y1.
 *
 * For every U/V sample the loops read two consecutive Y bytes and write four output
 * bytes, so size.width is used as the number of U/V samples per row and size.height as
 * the number of rows. Strides are byte distances between consecutive rows.
 * assertSupportedConfiguration() is called first; in a build without CAROTENE_NEON the
 * arguments are otherwise unused.
 */
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
    // exclusive bound for starting a step that consumes 32 U/V samples
    const size_t limit32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    // exclusive bound for starting a step that consumes 8 U/V samples
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * const yRow = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * const uRow = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * const vRow = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * const outRow = internal::getRowPtr(dstBase, dstStride, row);

        // c indexes U/V samples; Y is read at 2 * c and output written at 4 * c
        size_t c = 0u;

#ifndef ANDROID
        for (; c < limit32; c += 32)
        {
            const u8 * const y = yRow + 2 * c;
            const u8 * const u = uRow + c;
            const u8 * const v = vRow + c;
            u8 * const out = outRow + 4 * c;

            internal::prefetch(y);
            internal::prefetch(u);
            internal::prefetch(v);

            // first half: 16 U/V samples, 32 Y samples (de-interleaved into even/odd)
            uint8x16x2_t yPair = vld2q_u8(y);
            uint8x16x4_t packed;
            packed.val[0] = vld1q_u8(u);
            packed.val[1] = yPair.val[0];
            packed.val[2] = vld1q_u8(v);
            packed.val[3] = yPair.val[1];
            vst4q_u8(out, packed);

            // second half
            yPair = vld2q_u8(y + 32);
            packed.val[0] = vld1q_u8(u + 16);
            packed.val[1] = yPair.val[0];
            packed.val[2] = vld1q_u8(v + 16);
            packed.val[3] = yPair.val[1];
            vst4q_u8(out + 64, packed);
        }
#endif

        for (; c < limit8; c += 8)
        {
            const uint8x8x2_t yPair = vld2_u8(yRow + 2 * c);
            uint8x8x4_t packed;
            packed.val[0] = vld1_u8(uRow + c);
            packed.val[1] = yPair.val[0];
            packed.val[2] = vld1_u8(vRow + c);
            packed.val[3] = yPair.val[1];
            vst4_u8(outRow + 4 * c, packed);
        }

        // scalar remainder
        for (; c < size.width; ++c)
        {
            u8 * const out = outRow + 4 * c;
            out[0] = uRow[c];
            out[1] = yRow[2 * c];
            out[2] = vRow[c];
            out[3] = yRow[2 * c + 1];
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

340
3rdparty/carotene/src/cmp.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// vnst: store two comparison-mask vectors as one byte per element.
// 8-bit masks are already bytes: store both vectors back to back (32 bytes).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
{
    vst1q_u8(dst, v1);
    vst1q_u8(dst + 16, v2);
}
// 16-bit masks: narrow each vector to 8 bytes and store the 16 bytes together.
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
{
    const uint8x8_t firstHalf = vmovn_u16(v1);
    const uint8x8_t secondHalf = vmovn_u16(v2);
    vst1q_u8(dst, vcombine_u8(firstHalf, secondHalf));
}
// 32-bit masks: narrow twice (32 -> 16 -> 8 bits) and store the resulting 8 bytes.
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
{
    const uint16x8_t narrowed16 = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
    vst1_u8(dst, vmovn_u16(narrowed16));
}
// Primary vtail template: used for element sizes without a dedicated specialization.
// It performs no vector tail step and leaves x untouched, so the caller's scalar loop
// handles every remaining element.
template <typename Op, int elsize> struct vtail
{
    static inline void compare(const typename Op::type * /*src0*/, const typename Op::type * /*src1*/,
                               u8 * /*dst*/, const Op & /*op*/,
                               size_t & /*x*/, size_t /*width*/)
    {
        // intentionally empty
    }
};
// vtail specialization for 2-byte element types.
// Processes one extra 8-element vector after the main loop of vcompare, if it fits.
//   src0, src1 - current row of each input
//   dst        - current row of the byte mask output
//   op         - comparison functor (128-bit overload is used)
//   x          - in/out: index of the first unprocessed element; advanced by 8 when a
//                vector step was taken
//   width      - number of elements in the row
template <typename Op> struct vtail<Op, 2>
{
    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
                               u8 * dst, const Op & op,
                               size_t &x, size_t width)
    {
        typedef typename Op::type type;
        typedef typename internal::VecTraits<type>::vec128 vec128;
        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;

        // There are no more than 15 elements in the tail, so an 8-element vector is
        // needed at most once. The bound is inclusive: elements x .. x+7 are all inside
        // the row when x + 8 == width, so a tail of exactly 8 is vectorized as well.
        if (x + 8 <= width)
        {
            vec128 v_src0, v_src1;
            uvec128 v_dst;

            v_src0 = internal::vld1q(src0 + x);
            v_src1 = internal::vld1q(src1 + x);
            op(v_src0, v_src1, v_dst);
            // narrow the 16-bit masks to bytes before storing
            internal::vst1(dst + x, internal::vmovn(v_dst));
            x += 8;
        }
    }
};
// vtail specialization for 1-byte element types.
// Processes up to one 16-element and one 8-element vector after the main loop of
// vcompare, whichever still fit.
//   src0, src1 - current row of each input
//   dst        - current row of the byte mask output
//   op         - comparison functor (128-bit and 64-bit overloads are used)
//   x          - in/out: index of the first unprocessed element; advanced by the number
//                of elements handled here
//   width      - number of elements in the row
template <typename Op> struct vtail<Op, 1>
{
    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
                               u8 * dst, const Op & op,
                               size_t &x, size_t width)
    {
        typedef typename Op::type type;
        typedef typename internal::VecTraits<type>::vec128 vec128;
        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<type>::vec64 vec64;
        typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;

        // There are no more than 31 elements in the tail, so 16+8, 16 or 8 elements can
        // be handled once. Both bounds are inclusive: a step that ends exactly at the
        // last element of the row is still entirely inside the row.
        if (x + 16 <= width)
        {
            vec128 v_src0, v_src1;
            uvec128 v_dst;

            v_src0 = internal::vld1q(src0 + x);
            v_src1 = internal::vld1q(src1 + x);
            op(v_src0, v_src1, v_dst);
            internal::vst1q(dst + x, v_dst);
            x += 16;
        }
        if (x + 8 <= width)
        {
            vec64 v_src0, v_src1;
            uvec64 v_dst;

            v_src0 = internal::vld1(src0 + x);
            v_src1 = internal::vld1(src1 + x);
            op(v_src0, v_src1, v_dst);
            internal::vst1(dst + x, v_dst);
            x += 8;
        }
    }
};
// Element-wise comparison driver.
// Applies op to every pair of elements of two images and writes one byte per element
// into dst.
//   size                  - width (elements per row) and height (rows); taken by value
//                           because it is rewritten when rows are merged
//   src0Base / src0Stride - first input and its row stride in bytes
//   src1Base / src1Stride - second input and its row stride in bytes
//   dstBase / dstStride   - u8 output and its row stride in bytes
//   op                    - functor providing 128-bit, 64-bit and scalar overloads
// Each row is processed in three stages: a main loop consuming 32 bytes of each input
// per step, the type-specific vtail step(s), and a scalar loop for the remainder.
template <typename Op>
void vcompare(Size2D size,
              const typename Op::type * src0Base, ptrdiff_t src0Stride,
              const typename Op::type * src1Base, ptrdiff_t src1Stride,
              u8 * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type type;
    typedef typename internal::VecTraits<type>::vec128 vec128;
    typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;

    // All rows can be treated as one long row only if every buffer is densely packed.
    // The inputs hold sizeof(type) bytes per element but the output holds one byte per
    // element, so the destination has to be checked against its own packed row length;
    // comparing it with the input stride would merge rows of a padded destination and
    // write the results of rows 1..h-1 into the padding of row 0.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(type)) &&
        dstStride == (ptrdiff_t)(size.width * sizeof(u8)))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // number of elements consumed by one main-loop step (two 128-bit vectors)
    const u32 step_base = 32 / sizeof(type);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;

    for (size_t y = 0; y < size.height; ++y)
    {
        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
        size_t x = 0;

        for( ; x < roiw_base; x += step_base )
        {
            internal::prefetch(src0 + x);
            internal::prefetch(src1 + x);

            vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
            vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
            uvec128 v_dst0;
            uvec128 v_dst1;

            op(v_src00, v_src10, v_dst0);
            op(v_src01, v_src11, v_dst1);

            // narrow both masks to bytes and store step_base bytes
            vnst(dst + x, v_dst0, v_dst1);
        }

        // optional shorter vector steps; advances x
        vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);

        for (; x < size.width; ++x)
        {
            op(src0 + x, src1 + x, dst + x);
        }
    }
}
template<typename T>
struct OpCmpEQ
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vceqq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vceq(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpNE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 0 : 255;
}
};
template<typename T>
struct OpCmpGT
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgtq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcgt(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] > src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpGE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgeq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcge(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] >= src1[0] ? 255 : 0;
}
};
}
// IMPL_CMPOP(op, type), NEON build: defines the public overload cmp<op>() for the given
// element type. It checks the configuration and forwards all arguments to vcompare()
// with a default-constructed OpCmp<op><type> functor.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
vcompare(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
OpCmp##op<type>()); \
}
#else
// IMPL_CMPOP(op, type), build without NEON: same signature, no implementation.
// assertSupportedConfiguration() is still called first; the arguments are cast to void
// only to avoid unused-parameter warnings.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)src0Base; \
(void)src0Stride; \
(void)src1Base; \
(void)src1Stride; \
(void)dstBase; \
(void)dstStride; \
}
#endif
// Public comparison entry points: cmpEQ, cmpNE, cmpGT and cmpGE, each overloaded for
// u8, s8, u16, s16, u32, s32 and f32 inputs. No less-than variants are defined here.
IMPL_CMPOP(EQ, u8)
IMPL_CMPOP(EQ, s8)
IMPL_CMPOP(EQ, u16)
IMPL_CMPOP(EQ, s16)
IMPL_CMPOP(EQ, u32)
IMPL_CMPOP(EQ, s32)
IMPL_CMPOP(EQ, f32)
IMPL_CMPOP(NE, u8)
IMPL_CMPOP(NE, s8)
IMPL_CMPOP(NE, u16)
IMPL_CMPOP(NE, s16)
IMPL_CMPOP(NE, u32)
IMPL_CMPOP(NE, s32)
IMPL_CMPOP(NE, f32)
IMPL_CMPOP(GT, u8)
IMPL_CMPOP(GT, s8)
IMPL_CMPOP(GT, u16)
IMPL_CMPOP(GT, s16)
IMPL_CMPOP(GT, u32)
IMPL_CMPOP(GT, s32)
IMPL_CMPOP(GT, f32)
IMPL_CMPOP(GE, u8)
IMPL_CMPOP(GE, s8)
IMPL_CMPOP(GE, u16)
IMPL_CMPOP(GE, s16)
IMPL_CMPOP(GE, u32)
IMPL_CMPOP(GE, s32)
IMPL_CMPOP(GE, f32)
} // namespace CAROTENE_NS

2846
3rdparty/carotene/src/colorconvert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

108
3rdparty/carotene/src/common.cpp vendored Normal file
View File

@ -0,0 +1,108 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cstdlib>
#include <iostream>
#include "common.hpp"
namespace CAROTENE_NS {
// Reports whether this build contains the NEON implementations: true when
// CAROTENE_NEON is defined at compile time, false otherwise.
bool isSupportedConfiguration()
{
#ifdef CAROTENE_NEON
    const bool neonBuild = true;
#else
    const bool neonBuild = false;
#endif
    return neonBuild;
}
namespace internal {
// Terminates the process with a message on std::cerr when the library cannot serve the
// call: first when isSupportedConfiguration() is false, otherwise when the caller
// reports that its parameters are unsupported. Returns normally in every other case.
void assertSupportedConfiguration(bool parametersSupported)
{
    const char * reason = 0;

    if (!isSupportedConfiguration())
        reason = "internal error: attempted to use an unavailable function";
    else if (!parametersSupported)
        reason = "internal error: attempted to use a function with unsupported parameters";

    if (reason)
    {
        std::cerr << reason << std::endl;
        std::abort();
    }
}
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
{
ptrdiff_t p = _p + (ptrdiff_t)startMargin;
size_t len = _len + startMargin + endMargin;
if( (size_t)p < len )
return _p;
else if( borderType == BORDER_MODE_REPLICATE )
p = p < 0 ? 0 : (ptrdiff_t)len - 1;
else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
{
s32 delta = borderType == BORDER_MODE_REFLECT101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta;
}
while( (size_t)p >= len );
}
else if( borderType == BORDER_MODE_WRAP )
{
if( p < 0 )
p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len;
if( p >= (ptrdiff_t)len )
p %= (ptrdiff_t)len;
}
else if( borderType == BORDER_MODE_CONSTANT )
p = -1;
else
internal::assertSupportedConfiguration(false);
return p - (ptrdiff_t)startMargin;
}
} // namespace internal
} // namespace CAROTENE_NS

96
3rdparty/carotene/src/common.hpp vendored Normal file
View File

@ -0,0 +1,96 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_COMMON_HPP
#define CAROTENE_SRC_COMMON_HPP
#include <cstddef>
#include <algorithm>
#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
#define CAROTENE_NEON
#endif
#ifdef CAROTENE_NEON
#include <arm_neon.h>
#include "intrinsics.hpp"
#endif
#include <carotene/functions.hpp>
#include "saturate_cast.hpp"
namespace CAROTENE_NS { namespace internal {
// Issues a prefetch hint for the address `offset` bytes past ptr (320 bytes by default).
// Uses __builtin_prefetch on GNU-compatible compilers and __prefetch on MSVC when NEON
// is enabled; on any other toolchain the call does nothing.
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined __GNUC__
    const char * const target = reinterpret_cast<const char*>(ptr) + offset;
    __builtin_prefetch(target);
#elif defined _MSC_VER && defined CAROTENE_NEON
    const char * const target = reinterpret_cast<const char*>(ptr) + offset;
    __prefetch(target);
#else
    (void)ptr;
    (void)offset;
#endif
}
// Returns a pointer to the first element of row `row` of an image whose rows are
// `stride` bytes apart (stride may be negative). The offset is applied in bytes, not in
// units of T, and the constness of T is preserved.
template <typename T>
inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
{
    const char * const origin = reinterpret_cast<const char *>(base);
    const ptrdiff_t byteOffset = ptrdiff_t(row) * stride;
    return reinterpret_cast<T *>(const_cast<char *>(origin) + byteOffset);
}
// Aborts the process (after printing a message to std::cerr) when the library was built
// without NEON support or when parametersSupported is false; returns otherwise.
// Implemented in common.cpp.
void assertSupportedConfiguration(bool parametersSupported = true);
// Maps coordinate _p of a range of length _len (optionally extended by startMargin
// positions before and endMargin positions after it) to a coordinate inside the
// extended range according to borderType. Implemented in common.cpp.
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
/*!
 * Rounds a pointer up to a multiple of n bytes
 *
 * Returns the smallest address that is not below ptr and is a multiple of n, i.e. the
 * pointer is moved forward by 0 to n-1 bytes. The rounding is done with a bit mask, so
 * n has to be a power of two. By default n is sizeof(T).
 */
template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
{
    const size_t address = (size_t)ptr;
    // ~(n - 1) equals -n in unsigned arithmetic: clears the low bits below n
    const size_t mask = ~(n - 1);
    return (T*)((address + n - 1) & mask);
}
}}
#endif

1331
3rdparty/carotene/src/convert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

399
3rdparty/carotene/src/convert_depth.cpp vendored Normal file
View File

@ -0,0 +1,399 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void lshiftConst(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
}
for (; j < size.width; j++)
{
dst[j] = ((s16)src[j] << shift);
}
}
}
// Specialization for a shift of zero: a plain u8 -> s16 widening copy with no shift
// instruction. Each row is handled in 16-element NEON steps, then 8-element NEON steps,
// then one element at a time. Strides are in bytes.
template <>
void lshiftConst<0>(const Size2D &size,
                    const u8 * srcBase, ptrdiff_t srcStride,
                    s16 * dstBase, ptrdiff_t dstStride)
{
    // exclusive bounds for starting a 16- resp. 8-element step
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * in = internal::getRowPtr(srcBase, srcStride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        while (col < end16)
        {
            internal::prefetch(in + col);
            const uint8x16_t pixels = vld1q_u8(in + col);
            const int16x8_t lower = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t upper = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
            vst1q_s16(out + col, lower);
            vst1q_s16(out + col + 8, upper);
            col += 16;
        }

        while (col < end8)
        {
            const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(in + col)));
            vst1q_s16(out + col, widened);
            col += 8;
        }

        for (; col < size.width; ++col)
        {
            out[col] = (s16)in[col];
        }
    }
}
template <int shift>
void rshiftConst(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)((src[j] >> shift));
}
}
}
}
// Specialization for a shift of zero: narrows s16 elements to u8 with no shift
// instruction.
//   CONVERT_POLICY_SATURATE - the value is saturated to the u8 range
//   any other policy (wrap) - the low 8 bits of the value are kept
// Each row is handled in 16-element NEON steps, then 8-element NEON steps, then one
// element at a time. Strides are in bytes.
template <>
void rshiftConst<0>(const Size2D &size,
                    const s16 * srcBase, ptrdiff_t srcStride,
                    u8 * dstBase, ptrdiff_t dstStride,
                    CONVERT_POLICY cpolicy)
{
    // exclusive bounds for starting a 16- resp. 8-element step
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * in = internal::getRowPtr(srcBase, srcStride, row);
        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        if (!saturate)
        {
            // wrap: truncating narrow
            for (; col < end16; col += 16)
            {
                internal::prefetch(in + col);
                const int16x8_t first = vld1q_s16(in + col);
                const int16x8_t second = vld1q_s16(in + col + 8);
                const int8x16_t narrowed = vcombine_s8(vmovn_s16(first), vmovn_s16(second));
                vst1q_u8(out + col, vreinterpretq_u8_s8(narrowed));
            }
            for (; col < end8; col += 8)
            {
                const int16x8_t values = vld1q_s16(in + col);
                vst1_u8(out + col, vreinterpret_u8_s8(vmovn_s16(values)));
            }
            for (; col < size.width; ++col)
            {
                out[col] = (u8)in[col];
            }
            continue;
        }

        // saturate: signed -> unsigned saturating narrow
        for (; col < end16; col += 16)
        {
            internal::prefetch(in + col);
            const int16x8_t first = vld1q_s16(in + col);
            const int16x8_t second = vld1q_s16(in + col + 8);
            const uint8x16_t narrowed = vcombine_u8(vqmovun_s16(first), vqmovun_s16(second));
            vst1q_u8(out + col, narrowed);
        }
        for (; col < end8; col += 8)
        {
            const int16x8_t values = vld1q_s16(in + col);
            vst1_u8(out + col, vqmovun_s16(values));
        }
        for (; col < size.width; ++col)
        {
            out[col] = internal::saturate_cast<u8>(in[col]);
        }
    }
}
// Pointer type matching every lshiftConst<shift> instantiation; used to build the
// run-time dispatch table in lshift().
typedef void (* lshiftConstFunc)(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride);
// Pointer type matching every rshiftConst<shift> instantiation; used to build the
// run-time dispatch table in rshift().
typedef void (* rshiftConstFunc)(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy);
} // namespace
#endif
// Converts a u8 image to s16 and shifts every element left by `shift` bits.
//   size                - width (elements per row) and height (rows)
//   srcBase / srcStride - u8 input and its row stride in bytes
//   dstBase / dstStride - s16 output and its row stride in bytes
//   shift               - shift amount; for 16 or more the output is filled with zeros
// assertSupportedConfiguration() is called first; in a build without CAROTENE_NEON the
// arguments are otherwise unused.
void lshift(const Size2D &size,
            const u8 * srcBase, ptrdiff_t srcStride,
            s16 * dstBase, ptrdiff_t dstStride,
            u32 shift)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (shift >= 16u)
    {
        // nothing of the 8-bit input is left in a 16-bit result
        for (size_t i = 0; i < size.height; ++i)
        {
            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
            std::memset(dst, 0, sizeof(s16) * size.width);
        }
        return;
    }

    // this ugly construction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
    // The table is static const so that it is not rebuilt on every call.
    static const lshiftConstFunc funcs[16] =
    {
        lshiftConst<0>,
        lshiftConst<1>,
        lshiftConst<2>,
        lshiftConst<3>,
        lshiftConst<4>,
        lshiftConst<5>,
        lshiftConst<6>,
        lshiftConst<7>,
        lshiftConst<8>,
        lshiftConst<9>,
        lshiftConst<10>,
        lshiftConst<11>,
        lshiftConst<12>,
        lshiftConst<13>,
        lshiftConst<14>,
        lshiftConst<15>
    };

    funcs[shift](size, srcBase, srcStride, dstBase, dstStride);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}
// Shifts every element of an s16 image right by `shift` bits and narrows it to u8.
//   size                - width (elements per row) and height (rows)
//   srcBase / srcStride - s16 input and its row stride in bytes
//   dstBase / dstStride - u8 output and its row stride in bytes
//   shift               - shift amount
//   cpolicy             - CONVERT_POLICY_SATURATE clamps to the u8 range,
//                         CONVERT_POLICY_WRAP keeps the low 8 bits
// For shift >= 16 only the sign of the input is left: with the wrap policy negative
// inputs give 255 and non-negative inputs give 0; with any other policy the output is
// filled with zeros.
// assertSupportedConfiguration() is called first; in a build without CAROTENE_NEON the
// arguments are otherwise unused.
void rshift(const Size2D &size,
            const s16 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            u32 shift, CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (shift >= 16u)
    {
        if (cpolicy == CONVERT_POLICY_WRAP)
        {
            size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
            size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
            int16x8_t v_zero = vdupq_n_s16(0);

            for (size_t i = 0; i < size.height; ++i)
            {
                const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
                size_t j = 0;

                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src + j);
                    int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
                    // (src < 0) yields an all-ones mask, narrowed to 0xFF per element
                    uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
                                                   vmovn_u16(vcltq_s16(v_src1, v_zero)));
                    vst1q_u8(dst + j, v_dst);
                }
                for (; j < roiw8; j += 8)
                {
                    int16x8_t v_src = vld1q_s16(src + j);
                    vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
                }

                for (; j < size.width; j++)
                {
                    dst[j] = src[j] >= 0 ? 0 : 255;
                }
            }
        }
        else
        {
            for (size_t i = 0; i < size.height; ++i)
            {
                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
                std::memset(dst, 0, sizeof(u8) * size.width);
            }
        }
        return;
    }

    // this ugly construction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
    // The table is static const so that it is not rebuilt on every call.
    static const rshiftConstFunc funcs[16] =
    {
        rshiftConst<0>,
        rshiftConst<1>,
        rshiftConst<2>,
        rshiftConst<3>,
        rshiftConst<4>,
        rshiftConst<5>,
        rshiftConst<6>,
        rshiftConst<7>,
        rshiftConst<8>,
        rshiftConst<9>,
        rshiftConst<10>,
        rshiftConst<11>,
        rshiftConst<12>,
        rshiftConst<13>,
        rshiftConst<14>,
        rshiftConst<15>
    };

    funcs[shift](size, srcBase, srcStride, dstBase, dstStride, cpolicy);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
    (void)cpolicy;
#endif
}
} // namespace CAROTENE_NS

2498
3rdparty/carotene/src/convert_scale.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

340
3rdparty/carotene/src/convolution.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
/*
 * Reports whether convolution() can handle the given configuration:
 * the platform configuration must be supported, the image must be at least
 * 8 pixels wide, the border mode must be CONSTANT or REPLICATE, and the
 * kernel must be exactly 3x3.
 */
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
                            BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;

    if (size.width < 8)
        return false;

    if (border != BORDER_MODE_CONSTANT && border != BORDER_MODE_REPLICATE)
        return false;

    return ksize.width == 3 && ksize.height == 3;
}
#ifdef CAROTENE_NEON

namespace {

// Wraps vshrq_n_s32 (which needs a compile-time shift amount) in a function
// template so that one instantiation per amount can be taken by address.
template <int kBits>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, kBits);
}

// A shift by zero bits is the identity; it is specialised so that
// vshrq_n_s32 is never instantiated with an immediate of 0.
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

// Pointer type shared by all vshrq_s32<N> instantiations.
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);

#endif
/*
 * 3x3 convolution of a u8 image with an s16 kernel.
 *
 * Each output pixel is
 *     sum over dy, dx in {-1, 0, 1} of
 *         src(y + dy, x + dx) * kernelBase[(1 - dy) * 3 + (1 - dx)]
 * arithmetically shifted right by `scale` bits and saturated to u8.
 * Pixels outside the image are taken as `borderValue` (BORDER_MODE_CONSTANT)
 * or as the nearest edge pixel (BORDER_MODE_REPLICATE).
 *
 * The configuration must satisfy isConvolutionSupported(). In addition
 * `scale` must be in [0, 32]: it indexes a 33-entry table of shift helpers,
 * so larger values are rejected through assertSupportedConfiguration()
 * instead of reading past the end of that table.
 *
 * Without CAROTENE_NEON the arguments are ignored and nothing is written.
 */
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border) && scale <= 32u);
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    // sliding window of three 8-pixel vectors for each of the three source rows
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    // vshrq_n_s32 needs a compile-time shift amount, hence one helper per value
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];

    // The scalar tail shifts a 32-bit int with operator>>, which is undefined for
    // a shift of 32. An arithmetic shift by 31 yields the same value (0 or -1)
    // as the vector shift by 32, so the tail uses 31 in that case.
    const u32 tailShift = scale < 32u ? scale : 31u;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // a NULL row pointer marks a row that lies in the constant border
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // The vector loop loads 8 bytes at offset x for every x <= bwidth, which can
        // reach past the end of the row; on the last two rows the bound is reduced by
        // one vector (presumably so these loads stay inside the image buffer).
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }

                // nothing to emit yet: output lags the loads by one vector
                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            {
                // combine 3 "shifted" vectors (row y - 1): pixels x-1, x, x+1
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            {
                // combine 3 "shifted" vectors (row y)
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            {
                // combine 3 "shifted" vectors (row y + 1)
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }

            // make scale
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // saturate s32 -> u16 -> u8 and store the 8 pixels that precede x
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // scalar tail: resume at the first pixel the vector loop did not write
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> tailShift);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS

430
3rdparty/carotene/src/count_nonzero.cpp vendored Normal file
View File

@ -0,0 +1,430 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
/*
 * Counts the non-zero elements of a u8 image.
 *
 * Returns the count, saturated to 0x7fFFffFF. The running total is kept in an
 * unsigned 64-bit integer, so saturation is decided by a plain comparison and
 * never depends on signed overflow. Without CAROTENE_NEON the function
 * returns 0.
 */
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        // rows are contiguous: treat the whole image as one long row
        size.width *= size.height;
        size.height = 1;
    }
    // mask built in size_t so that widths above 32 bits are not truncated
    const size_t roiw16 = size.width & ~(size_t)15;
    // at most 255 vectors per block, so the per-lane u8 counters cannot wrap
    const size_t blockSize = 16 * 255;
    const u64 maxResult = 0x7fFFffFF;
    const uint8x16_t vc1 = vmovq_n_u8(1);

    u64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        while (i < roiw16)
        {
            const size_t blockEnd = std::min(i + blockSize, roiw16);
            uint8x16_t vs = vmovq_n_u8(0);
            for (; i < blockEnd; i += 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                // min(x, 1) is 1 for every non-zero byte and 0 otherwise
                uint8x16_t vnz = vminq_u8(vln, vc1);
                vs = vaddq_u8(vs, vnz);
            }

            // widen and reduce the 16 lane counters to two u32 partial sums
            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            u32 s[2];
            vst1_u32(s, vs2);
            result += (u64)s[0] + s[1];
        }

        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;

        if (result > maxResult) // saturate in case of more than ~2G non-zeros
            return 0x7fFFffFF;
    }
    return (s32)result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of a u16 image.
 *
 * Returns the count, saturated to 0x7fFFffFF. The running total is kept in an
 * unsigned 64-bit integer, so saturation is decided by a plain comparison and
 * never depends on signed overflow. Without CAROTENE_NEON the function
 * returns 0.
 */
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is handed to internal::getRowPtr as the row pitch, which is a
    // byte count in Carotene (NOTE(review): confirm against common.hpp), so rows
    // are contiguous when the pitch equals width * sizeof(u16).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(u16)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // mask built in size_t so that widths above 32 bits are not truncated
    const size_t roiw8 = size.width & ~(size_t)7;
    // at most 65535 vectors per block, so the per-lane u16 counters cannot wrap
    const size_t blockSize = 8 * (256 * 256 - 1);
    const u64 maxResult = 0x7fFFffFF;
    const uint16x8_t vc1 = vmovq_n_u16(1);

    u64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        while (i < roiw8)
        {
            const size_t blockEnd = std::min(i + blockSize, roiw8);
            uint16x8_t vs = vmovq_n_u16(0);
            for (; i < blockEnd; i += 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                // min(x, 1) is 1 for every non-zero element and 0 otherwise
                uint16x8_t vnz = vminq_u16(vln, vc1);
                vs = vaddq_u16(vs, vnz);
            }

            // widen and reduce the 8 lane counters to two u32 partial sums
            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
            u32 s[2];
            vst1_u32(s, vs2);
            result += (u64)s[0] + s[1];
        }

        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;

        if (result > maxResult) // saturate in case of more than ~2G non-zeros
            return 0x7fFFffFF;
    }
    return (s32)result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an s32 image.
 *
 * Returns the count, saturated to 0x7fFFffFF. Lane counters use saturating
 * unsigned adds and the running total is an unsigned 64-bit integer, so
 * saturation is decided by a plain comparison and never depends on signed
 * overflow. Without CAROTENE_NEON the function returns 0.
 */
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is handed to internal::getRowPtr as the row pitch, which is a
    // byte count in Carotene (NOTE(review): confirm against common.hpp), so rows
    // are contiguous when the pitch equals width * sizeof(s32).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(s32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // mask built in size_t so that widths above 32 bits are not truncated
    const size_t roiw4 = size.width & ~(size_t)3;
    const u64 maxResult = 0x7fFFffFF;
    const uint32x4_t vc1 = vmovq_n_u32(1);

    u64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        // only zero / non-zero matters, so the data is read as unsigned
        const u32 * src = (const u32 *)internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            // min(x, 1) is 1 for every non-zero element and 0 otherwise
            uint32x4_t vnz = vminq_u32(vln, vc1);
            vs = vqaddq_u32(vs, vnz);
        }

        // a saturated lane already exceeds maxResult, so clamping below stays correct
        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
        u32 s[2];
        vst1_u32(s, vs2);
        result += (u64)s[0] + s[1];

        for (; i < size.width; i++)
            result += (src[i] != 0) ? 1 : 0;

        if (result > maxResult) // saturate in case of more than ~2G non-zeros
            return 0x7fFFffFF;
    }
    return (s32)result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an f32 image.
 *
 * Returns the count, saturated to 0x7fFFffFF. The running total is an
 * unsigned 64-bit integer, so saturation is decided by a plain comparison and
 * never depends on signed overflow. Without CAROTENE_NEON the function
 * returns 0.
 *
 * NOTE(review): the scalar tail treats values of magnitude below
 * numeric_limits<float>::min() (sub-normals) as zero, while the vector path
 * compares against 0 with vceqq_f32. Whether both agree for sub-normal inputs
 * depends on the target's flush-to-zero behaviour - confirm.
 */
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is handed to internal::getRowPtr as the row pitch, which is a
    // byte count in Carotene (NOTE(review): confirm against common.hpp), so rows
    // are contiguous when the pitch equals width * sizeof(f32).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // mask built in size_t so that widths above 32 bits are not truncated
    const size_t roiw4 = size.width & ~(size_t)3;
    const u64 maxResult = 0x7fFFffFF;
    const float32x4_t vc0 = vmovq_n_f32(0);

    u64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const f32 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        int32x4_t vs = vmovq_n_s32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            // NOT(x == 0) is all ones, i.e. -1, for every non-zero lane
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            vs = vqaddq_s32(vs, vnz);
        }

        // lanes hold minus the count; negate (saturating) to get values in [0, INT_MAX]
        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
        int s[2];
        vst1_s32(s, vs2);
        result += (u64)s[0] + (u64)s[1];

        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min()) ? 0 : 1;

        if (result > maxResult) // saturate in case of more than ~2G non-zeros
            return 0x7fFFffFF;
    }
    return (s32)result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an f64 image.
 *
 * An element is zero only when all bits except the sign bit are clear, so
 * +0.0 and -0.0 are zero while sub-normals and NaNs count as non-zero. The
 * scalar tail uses `!= 0.0` so that it classifies elements the same way as
 * the vector path.
 *
 * Returns the count, saturated to 0x7fFFffFF. The running total is an
 * unsigned 64-bit integer, so saturation is decided by a plain comparison and
 * never depends on signed overflow. Without CAROTENE_NEON the function
 * returns 0.
 */
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // srcStride is handed to internal::getRowPtr as the row pitch, which is a
    // byte count in Carotene (NOTE(review): confirm against common.hpp), so rows
    // are contiguous when the pitch equals width * sizeof(f64).
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f64)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // masks built in size_t so that widths above 32 bits are not truncated
    const size_t roiw8 = size.width & ~(size_t)7;
    const size_t roiw4 = size.width & ~(size_t)3;
    const size_t roiw2 = size.width & ~(size_t)1;
    const u64 maxResult = 0x7fFFffFF;

    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); // drops the sign bit; will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);

    u64 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const f64 * src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        // each accumulator lane holds minus the number of non-zero doubles seen
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);

        for (; i < roiw8; i += 8)
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);
            // compare both 32-bit halves of every double against zero
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);
            // pairwise max merges the two halves: all ones (-1) if either half is non-zero
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }

        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }

        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }

        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        // saturating negate turns the (non-positive) lanes into counts in [0, INT_MAX]
        int32x2_t vsneg = vqneg_s32(vs1);

        s32 s[2];
        vst1_s32(s, vsneg);
        result += (u64)s[0] + (u64)s[1];

        // same classification as the vector path: only +/-0.0 is zero
        for (; i < size.width; i++)
            result += (src[i] != 0.0) ? 1 : 0;

        if (result > maxResult) // saturate in case of more than ~2G non-zeros
            return 0x7fFFffFF;
    }
    return (s32)result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
} // namespace CAROTENE_NS

694
3rdparty/carotene/src/div.cpp vendored Normal file
View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
// ---------------------------------------------------------------------------
// Vector helpers computing scale * v1 / v2 lane by lane.
//
// The 32-bit specialisations do the arithmetic in f32: the numerator is
// scaled, multiplied by internal::vrecp(q)_f32 of the denominator and
// converted back to integer. The generic templates widen narrower lanes with
// internal::vmovl until a 32-bit specialisation applies, then narrow the
// result again - with vqmovn in the "Saturate" family and vmovn in the "Wrap"
// family. The "Q" variants work on 128-bit vectors, the others on 64-bit ones.
// ---------------------------------------------------------------------------

template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                      internal::vmovl(internal::vget_low(v2)),
                                      scale)),
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                      internal::vmovl(internal::vget_high(v2)),
                                      scale)));
}

template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{
    const float32x4_t scaledNum = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
    const float32x4_t invDen = internal::vrecpq_f32(vcvtq_f32_s32(v2));
    return vcvtq_s32_f32(vmulq_f32(scaledNum, invDen));
}

template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{
    const float32x4_t scaledNum = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
    const float32x4_t invDen = internal::vrecpq_f32(vcvtq_f32_u32(v2));
    return vcvtq_u32_f32(vmulq_f32(scaledNum, invDen));
}

template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1),
                                         internal::vmovl(v2),
                                         scale));
}

template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{
    const float32x2_t scaledNum = vmul_n_f32(vcvt_f32_s32(v1), scale);
    const float32x2_t invDen = internal::vrecp_f32(vcvt_f32_s32(v2));
    return vcvt_s32_f32(vmul_f32(scaledNum, invDen));
}

template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{
    const float32x2_t scaledNum = vmul_n_f32(vcvt_f32_u32(v1), scale);
    const float32x2_t invDen = internal::vrecp_f32(vcvt_f32_u32(v2));
    return vcvt_u32_f32(vmul_f32(scaledNum, invDen));
}

template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                 internal::vmovl(internal::vget_low(v2)),
                                 scale)),
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                 internal::vmovl(internal::vget_high(v2)),
                                 scale)));
}

template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{
    const float32x4_t scaledNum = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
    const float32x4_t invDen = internal::vrecpq_f32(vcvtq_f32_s32(v2));
    return vcvtq_s32_f32(vmulq_f32(scaledNum, invDen));
}

template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{
    const float32x4_t scaledNum = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
    const float32x4_t invDen = internal::vrecpq_f32(vcvtq_f32_u32(v2));
    return vcvtq_u32_f32(vmulq_f32(scaledNum, invDen));
}

template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1),
                                    internal::vmovl(v2),
                                    scale));
}

template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{
    const float32x2_t scaledNum = vmul_n_f32(vcvt_f32_s32(v1), scale);
    const float32x2_t invDen = internal::vrecp_f32(vcvt_f32_s32(v2));
    return vcvt_s32_f32(vmul_f32(scaledNum, invDen));
}

template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{
    const float32x2_t scaledNum = vmul_n_f32(vcvt_f32_u32(v1), scale);
    const float32x2_t invDen = internal::vrecp_f32(vcvt_f32_u32(v2));
    return vcvt_u32_f32(vmul_f32(scaledNum, invDen));
}
// Type-generic names for the NEON "test bits" intrinsics so that the div()
// template can call them on whatever vector type VecTraits<T> selects.
// div() calls them as vtstq(v, v) / vtst(v, v): the result has all bits set in
// every lane where v is non-zero and is zero elsewhere, which makes it usable
// as a "divisor != 0" mask. The signed overloads reinterpret the unsigned mask
// produced by the intrinsic back to the signed vector type of their arguments.
// 128-bit overloads:
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
// 64-bit overloads:
inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
/*
 * Element-wise dst = scale * src0 / src1 for integer element types, writing 0
 * wherever src1 is 0.
 *
 * cpolicy selects how results are narrowed to T: CONVERT_POLICY_SATURATE uses
 * the divSaturate helpers and saturate_cast, any other value uses the divWrap
 * helpers and a plain cast of the truncated quotient.
 *
 * If scale is 0, or T is an integer type and |scale * max(T)| < 1, the
 * destination is simply zero-filled.
 *
 * Without CAROTENE_NEON the arguments are ignored and nothing is written.
 */
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Shortcut: the quotient is known to be zero everywhere.
    const bool resultAlwaysZero =
        scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) <  1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f);
    if (resultAlwaysZero)
    {
        for (size_t row = 0; row < size.height; ++row)
        {
            T * dstRow = internal::getRowPtr(dstBase, dstStride, row);
            std::memset(dstRow, 0, sizeof(T) * size.width);
        }
        return;
    }

    // number of elements per 128-bit / 64-bit vector and the matching loop bounds
    const size_t step128 = 16 / sizeof(T);
    const size_t step64 = 8 / sizeof(T);
    const size_t vecEnd128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t vecEnd64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const T * num = internal::getRowPtr(src0Base, src0Stride, row);
        const T * den = internal::getRowPtr(src1Base, src1Stride, row);
        T * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        if (saturate)
        {
            for (; col < vecEnd128; col += step128)
            {
                internal::prefetch(num + col);
                internal::prefetch(den + col);

                vec128 v_num = internal::vld1q(num + col);
                vec128 v_den = internal::vld1q(den + col);

                // all-ones where the divisor is non-zero; zeroes the other lanes
                vec128 v_nonZero = vtstq(v_den, v_den);
                vec128 v_quot = divSaturateQ(v_num, v_den, scale);
                internal::vst1q(out + col, internal::vandq(v_nonZero, v_quot));
            }
            for (; col < vecEnd64; col += step64)
            {
                vec64 v_num = internal::vld1(num + col);
                vec64 v_den = internal::vld1(den + col);

                vec64 v_nonZero = vtst(v_den, v_den);
                vec64 v_quot = divSaturate(v_num, v_den, scale);
                internal::vst1(out + col, internal::vand(v_nonZero, v_quot));
            }
            for (; col < size.width; col++)
            {
                if (den[col])
                    out[col] = internal::saturate_cast<T>(scale * num[col] / den[col]);
                else
                    out[col] = 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; col < vecEnd128; col += step128)
            {
                internal::prefetch(num + col);
                internal::prefetch(den + col);

                vec128 v_num = internal::vld1q(num + col);
                vec128 v_den = internal::vld1q(den + col);

                vec128 v_nonZero = vtstq(v_den, v_den);
                vec128 v_quot = divWrapQ(v_num, v_den, scale);
                internal::vst1q(out + col, internal::vandq(v_nonZero, v_quot));
            }
            for (; col < vecEnd64; col += step64)
            {
                vec64 v_num = internal::vld1(num + col);
                vec64 v_den = internal::vld1(den + col);

                vec64 v_nonZero = vtst(v_den, v_den);
                vec64 v_quot = divWrap(v_num, v_den, scale);
                internal::vst1(out + col, internal::vand(v_nonZero, v_quot));
            }
            for (; col < size.width; col++)
            {
                if (den[col])
                    out[col] = (T)((s32)trunc(scale * num[col] / den[col]));
                else
                    out[col] = 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
#ifdef CAROTENE_NEON
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
/*
 * Per-lane (scale / v2) for a 128-bit integer vector, narrowed with the
 * plain (non-saturating) narrow helper.
 *
 * Generic case: each half of v2 is widened with internal::vmovl, processed
 * recursively at the wider element type, narrowed back with internal::vmovn
 * and the halves are rejoined with internal::vcombine. The recursion stops at
 * the 32-bit specializations that follow.
 *
 * Zero lanes are not special-cased here.
 */
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)));
}

// s32 lanes: int -> float, reciprocal through internal::vrecpq_f32, multiply
// by scale, float -> int with vcvtq_s32_f32 (the same instruction sequence as
// the saturating variant at this width).
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{
    const float32x4_t v_den = vcvtq_f32_s32(v2);
    const float32x4_t v_inv = internal::vrecpq_f32(v_den);
    return vcvtq_s32_f32(vmulq_n_f32(v_inv, scale));
}

// u32 lanes: same sequence with the unsigned conversions.
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{
    const float32x4_t v_den = vcvtq_f32_u32(v2);
    const float32x4_t v_inv = internal::vrecpq_f32(v_den);
    return vcvtq_u32_f32(vmulq_n_f32(v_inv, scale));
}
/*
 * 64-bit counterpart of recipWrapQ: per-lane (scale / v2).
 *
 * Generic case: the 64-bit vector is widened to 128 bits with
 * internal::vmovl, evaluated by recipWrapQ at the wider element type and
 * narrowed back with internal::vmovn.
 *
 * Zero lanes are not special-cased here.
 */
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(
        recipWrapQ(internal::vmovl(v2), scale));
}

// s32x2 lanes: computed directly in float and converted back with vcvt_s32_f32.
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{
    const float32x2_t v_den = vcvt_f32_s32(v2);
    const float32x2_t v_inv = internal::vrecp_f32(v_den);
    return vcvt_s32_f32(vmul_n_f32(v_inv, scale));
}

// u32x2 lanes: same sequence with the unsigned conversions.
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{
    const float32x2_t v_den = vcvt_f32_u32(v2);
    const float32x2_t v_inv = internal::vrecp_f32(v_den);
    return vcvt_u32_f32(vmul_n_f32(v_inv, scale));
}
#endif
/*
 * Element-wise scaled reciprocal of a 2D array: dst = scale / src1, with
 * dst = 0 wherever src1 == 0.
 *
 * size        - width/height of the processed region, in elements
 * src1Base    - first source row; rows are located with internal::getRowPtr
 *               using src1Stride
 * dstBase     - first destination row, located the same way with dstStride
 * scale       - numerator applied to every element
 * cpolicy     - CONVERT_POLICY_SATURATE selects the recipSaturate* helpers and
 *               internal::saturate_cast for the scalar tail; any other value
 *               takes the wrap path (recipWrap* helpers, truncating cast)
 *
 * Without CAROTENE_NEON the function only calls
 * internal::assertSupportedConfiguration() and touches nothing else.
 *
 * NOTE(review): the vector paths convert through the vcvt* intrinsics while
 * the scalar saturate tail relies on internal::saturate_cast, whose rounding
 * is not visible from here - confirm both produce the same value.
 */
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // For integer T, |scale| < 1 makes every quotient truncate to zero, so the
    // whole output can simply be cleared (same for scale == 0 with any T).
    const bool integerResultIsZero = std::numeric_limits<T>::is_integer &&
                                     scale < 1.0f &&
                                     scale > -1.0f;
    if (scale == 0.0f || integerResultIsZero)
    {
        for (size_t row = 0; row < size.height; ++row)
        {
            T * out = internal::getRowPtr(dstBase, dstStride, row);
            std::memset(out, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Number of elements per 128-bit / 64-bit vector and the exclusive upper
    // bounds for the column index at which a full vector still fits.
    const size_t lanes128 = 16 / sizeof(T);
    const size_t vecEnd128 = size.width >= (lanes128 - 1) ? size.width - lanes128 + 1 : 0;
    const size_t lanes64 = 8 / sizeof(T);
    const size_t vecEnd64 = size.width >= (lanes64 - 1) ? size.width - lanes64 + 1 : 0;

    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const T * in = internal::getRowPtr(src1Base, src1Stride, row);
        T * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        if (saturate)
        {
            for (; x < vecEnd128; x += lanes128)
            {
                internal::prefetch(in + x);
                vec128 v_den = internal::vld1q(in + x);
                // All-ones in lanes where the divisor is non-zero.
                vec128 v_nonzero = vtstq(v_den, v_den);
                internal::vst1q(out + x, internal::vandq(v_nonzero, recipSaturateQ(v_den, scale)));
            }
            for (; x < vecEnd64; x += lanes64)
            {
                vec64 v_den = internal::vld1(in + x);
                vec64 v_nonzero = vtst(v_den, v_den);
                internal::vst1(out + x, internal::vand(v_nonzero, recipSaturate(v_den, scale)));
            }
            // Scalar tail for the remaining columns.
            for (; x < size.width; ++x)
            {
                if (in[x])
                    out[x] = internal::saturate_cast<T>(scale / in[x]);
                else
                    out[x] = 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; x < vecEnd128; x += lanes128)
            {
                internal::prefetch(in + x);
                vec128 v_den = internal::vld1q(in + x);
                vec128 v_nonzero = vtstq(v_den, v_den);
                internal::vst1q(out + x, internal::vandq(v_nonzero, recipWrapQ(v_den, scale)));
            }
            for (; x < vecEnd64; x += lanes64)
            {
                vec64 v_den = internal::vld1(in + x);
                vec64 v_nonzero = vtst(v_den, v_den);
                internal::vst1(out + x, internal::vand(v_nonzero, recipWrap(v_den, scale)));
            }
            // Scalar tail: truncate toward zero, then plain cast to T.
            for (; x < size.width; ++x)
            {
                if (in[x])
                    out[x] = (T)((s32)trunc(scale / in[x]));
                else
                    out[x] = 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
}
/*
 * Public integer overloads of div(). Each one forwards all of its arguments,
 * unchanged, to the div<T> template instantiated for its element type.
 */

// u8 elements.
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size,
            src0Base, src0Stride,
            src1Base, src1Stride,
            dstBase, dstStride,
            scale, cpolicy);
}

// s8 elements.
void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size,
            src0Base, src0Stride,
            src1Base, src1Stride,
            dstBase, dstStride,
            scale, cpolicy);
}

// u16 elements.
void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size,
             src0Base, src0Stride,
             src1Base, src1Stride,
             dstBase, dstStride,
             scale, cpolicy);
}

// s16 elements.
void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size,
             src0Base, src0Stride,
             src1Base, src1Stride,
             dstBase, dstStride,
             scale, cpolicy);
}

// s32 elements.
void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size,
             src0Base, src0Stride,
             src1Base, src1Stride,
             dstBase, dstStride,
             scale, cpolicy);
}
/*
 * Element-wise f32 division of two 2D arrays: dst = scale * src0 / src1, with
 * dst = 0 wherever src1 == 0.
 *
 * size               - width/height of the processed region, in elements
 * src0Base/src1Base  - first rows of numerator and denominator; rows are
 *                      located with internal::getRowPtr and the given strides
 * dstBase            - first destination row, located the same way
 * scale              - factor applied to the numerator; scale == 0 clears the
 *                      output, and a scale within FLT_EPSILON of 1 skips the
 *                      multiplication
 *
 * The vector paths multiply by internal::vrecpq_f32 / internal::vrecp_f32 of
 * the denominator, while the scalar tail uses a real division.
 *
 * Without CAROTENE_NEON the function only calls
 * internal::assertSupportedConfiguration().
 */
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t row = 0; row < size.height; ++row)
        {
            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
            std::memset(out, 0, sizeof(f32) * size.width);
        }
        return;
    }

    const float32x4_t v_zero128 = vdupq_n_f32(0.0f);
    const float32x2_t v_zero64 = vget_low_f32(v_zero128);
    // Exclusive column bounds up to which a 4-lane / 2-lane vector still fits.
    const size_t vecEnd128 = size.width >= 3 ? size.width - 3 : 0;
    const size_t vecEnd64 = size.width >= 1 ? size.width - 1 : 0;
    const bool unitScale = std::fabs(scale - 1.0f) < FLT_EPSILON;

    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * num = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * den = internal::getRowPtr(src1Base, src1Stride, row);
        f32 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        if (unitScale)
        {
            for (; x < vecEnd128; x += 4)
            {
                internal::prefetch(num + x);
                internal::prefetch(den + x);
                const float32x4_t v_num = vld1q_f32(num + x);
                const float32x4_t v_den = vld1q_f32(den + x);
                // All-ones where the denominator is zero; vbic clears those lanes.
                const uint32x4_t v_isZero = vceqq_f32(v_den, v_zero128);
                const float32x4_t v_quot = vmulq_f32(v_num, internal::vrecpq_f32(v_den));
                vst1q_f32(out + x, vreinterpretq_f32_u32(
                    vbicq_u32(vreinterpretq_u32_f32(v_quot), v_isZero)));
            }
            for (; x < vecEnd64; x += 2)
            {
                const float32x2_t v_num = vld1_f32(num + x);
                const float32x2_t v_den = vld1_f32(den + x);
                const uint32x2_t v_isZero = vceq_f32(v_den, v_zero64);
                const float32x2_t v_quot = vmul_f32(v_num, internal::vrecp_f32(v_den));
                vst1_f32(out + x, vreinterpret_f32_u32(
                    vbic_u32(vreinterpret_u32_f32(v_quot), v_isZero)));
            }
            for (; x < size.width; ++x)
            {
                if (den[x] != 0.0f)
                    out[x] = num[x] / den[x];
                else
                    out[x] = 0.0f;
            }
        }
        else
        {
            for (; x < vecEnd128; x += 4)
            {
                internal::prefetch(num + x);
                internal::prefetch(den + x);
                const float32x4_t v_num = vld1q_f32(num + x);
                const float32x4_t v_den = vld1q_f32(den + x);
                const uint32x4_t v_isZero = vceqq_f32(v_den, v_zero128);
                const float32x4_t v_quot = vmulq_f32(vmulq_n_f32(v_num, scale),
                                                     internal::vrecpq_f32(v_den));
                vst1q_f32(out + x, vreinterpretq_f32_u32(
                    vbicq_u32(vreinterpretq_u32_f32(v_quot), v_isZero)));
            }
            for (; x < vecEnd64; x += 2)
            {
                const float32x2_t v_num = vld1_f32(num + x);
                const float32x2_t v_den = vld1_f32(den + x);
                const uint32x2_t v_isZero = vceq_f32(v_den, v_zero64);
                const float32x2_t v_quot = vmul_f32(vmul_n_f32(v_num, scale),
                                                    internal::vrecp_f32(v_den));
                vst1_f32(out + x, vreinterpret_f32_u32(
                    vbic_u32(vreinterpret_u32_f32(v_quot), v_isZero)));
            }
            for (; x < size.width; ++x)
            {
                if (den[x] != 0.0f)
                    out[x] = num[x] * scale / den[x];
                else
                    out[x] = 0.0f;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
/*
 * Public integer overloads of reciprocal(). Each one forwards all of its
 * arguments, unchanged, to the recip<T> template instantiated for its
 * element type.
 */

// u8 elements.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size,
              srcBase, srcStride,
              dstBase, dstStride,
              scale, cpolicy);
}

// s8 elements.
void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size,
              srcBase, srcStride,
              dstBase, dstStride,
              scale, cpolicy);
}

// u16 elements.
void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size,
               srcBase, srcStride,
               dstBase, dstStride,
               scale, cpolicy);
}

// s16 elements.
void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size,
               srcBase, srcStride,
               dstBase, dstStride,
               scale, cpolicy);
}

// s32 elements.
void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size,
               srcBase, srcStride,
               dstBase, dstStride,
               scale, cpolicy);
}
/*
 * Element-wise f32 scaled reciprocal of a 2D array: dst = scale / src, with
 * dst = 0 wherever src == 0.
 *
 * size     - width/height of the processed region, in elements
 * srcBase  - first source row; rows are located with internal::getRowPtr and
 *            srcStride
 * dstBase  - first destination row, located the same way with dstStride
 * scale    - numerator; scale == 0 clears the output, and a scale within
 *            FLT_EPSILON of 1 skips the multiplication
 *
 * The vector paths use internal::vrecpq_f32 / internal::vrecp_f32, while the
 * scalar tail uses a real division.
 *
 * Without CAROTENE_NEON the function only calls
 * internal::assertSupportedConfiguration().
 */
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t row = 0; row < size.height; ++row)
        {
            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
            std::memset(out, 0, sizeof(f32) * size.width);
        }
        return;
    }

    const float32x4_t v_zero128 = vdupq_n_f32(0.0f);
    const float32x2_t v_zero64 = vget_low_f32(v_zero128);
    // Exclusive column bounds up to which a 4-lane / 2-lane vector still fits.
    const size_t vecEnd128 = size.width >= 3 ? size.width - 3 : 0;
    const size_t vecEnd64 = size.width >= 1 ? size.width - 1 : 0;
    const bool unitScale = std::fabs(scale - 1.0f) < FLT_EPSILON;

    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * den = internal::getRowPtr(srcBase, srcStride, row);
        f32 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        if (unitScale)
        {
            for (; x < vecEnd128; x += 4)
            {
                internal::prefetch(den + x);
                const float32x4_t v_den = vld1q_f32(den + x);
                // All-ones where the input is zero; vbic clears those lanes.
                const uint32x4_t v_isZero = vceqq_f32(v_den, v_zero128);
                const float32x4_t v_inv = internal::vrecpq_f32(v_den);
                vst1q_f32(out + x, vreinterpretq_f32_u32(
                    vbicq_u32(vreinterpretq_u32_f32(v_inv), v_isZero)));
            }
            for (; x < vecEnd64; x += 2)
            {
                const float32x2_t v_den = vld1_f32(den + x);
                const uint32x2_t v_isZero = vceq_f32(v_den, v_zero64);
                const float32x2_t v_inv = internal::vrecp_f32(v_den);
                vst1_f32(out + x, vreinterpret_f32_u32(
                    vbic_u32(vreinterpret_u32_f32(v_inv), v_isZero)));
            }
            for (; x < size.width; ++x)
            {
                if (den[x] != 0.0f)
                    out[x] = 1.0f / den[x];
                else
                    out[x] = 0;
            }
        }
        else
        {
            for (; x < vecEnd128; x += 4)
            {
                internal::prefetch(den + x);
                const float32x4_t v_den = vld1q_f32(den + x);
                const uint32x4_t v_isZero = vceqq_f32(v_den, v_zero128);
                const float32x4_t v_inv = vmulq_n_f32(internal::vrecpq_f32(v_den), scale);
                vst1q_f32(out + x, vreinterpretq_f32_u32(
                    vbicq_u32(vreinterpretq_u32_f32(v_inv), v_isZero)));
            }
            for (; x < vecEnd64; x += 2)
            {
                const float32x2_t v_den = vld1_f32(den + x);
                const uint32x2_t v_isZero = vceq_f32(v_den, v_zero64);
                const float32x2_t v_inv = vmul_n_f32(internal::vrecp_f32(v_den), scale);
                vst1_f32(out + x, vreinterpret_f32_u32(
                    vbic_u32(vreinterpret_u32_f32(v_inv), v_isZero)));
            }
            for (; x < size.width; ++x)
            {
                if (den[x] != 0.0f)
                    out[x] = scale / den[x];
                else
                    out[x] = 0;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS

260
3rdparty/carotene/src/dot_product.cpp vendored Normal file
View File

@ -0,0 +1,260 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
/*
 * Dot product of two u8 images: the sum over all pixels of src0 * src1,
 * returned as f64.
 *
 * _size              - width/height of the processed region, in elements
 * src0Base/src1Base  - first rows of the two inputs; rows are located with
 *                      internal::getRowPtr and the given strides
 *
 * Without CAROTENE_NEON the function returns 0 after calling
 * internal::assertSupportedConfiguration().
 */
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Equal strides that match the row width mean both images are densely
    // packed, so they can be walked as one long row.
    const bool continuous = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width);
    if (continuous)
    {
        size.width *= size.height;
        size.height = 1;
    }

// Up to 66051 u8*u8 products fit into a u32 accumulator lane without overflow.
// Every 16-element step adds two products to each lane, hence the block limit
// of (66050 / 2) * 16 elements between flushes into the 64-bit accumulator.
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;
        uint64x2_t acc64 = vmovq_n_u64(0);

        while (x + 16 <= size.width)
        {
            // Last start index of a full 16-element step inside this block.
            const size_t last = std::min(x + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t accLo = vmovq_n_u32(0);
            uint32x4_t accHi = vmovq_n_u32(0);
            for (; x <= last; x += 16)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);

                const uint8x16_t a = vld1q_u8(rowA + x);
                const uint8x16_t b = vld1q_u8(rowB + x);

                const uint16x8_t prodLo = vmull_u8(vget_low_u8(a), vget_low_u8(b));
                const uint16x8_t prodHi = vmull_u8(vget_high_u8(a), vget_high_u8(b));

                accLo = vpadalq_u16(accLo, prodLo);
                accHi = vpadalq_u16(accHi, prodHi);
            }

            // Flush the 32-bit partial sums into the 64-bit accumulator.
            acc64 = vpadalq_u32(acc64, accLo);
            acc64 = vpadalq_u32(acc64, accHi);
        }

        // One optional 8-element step.
        if (x + 8 <= size.width)
        {
            const uint8x8_t a = vld1_u8(rowA + x);
            const uint8x8_t b = vld1_u8(rowB + x);

            acc64 = vpadalq_u32(acc64, vpaddlq_u16(vmull_u8(a, b)));
            x += 8;
        }

        total += (double)vget_lane_u64(vadd_u64(vget_low_u64(acc64), vget_high_u64(acc64)), 0);

        // Scalar tail.
        for (; x < size.width; ++x)
            total += s32(rowA[x]) * s32(rowB[x]);
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
/*
 * Dot product of two s8 images: the sum over all pixels of src0 * src1,
 * returned as f64.
 *
 * _size              - width/height of the processed region, in elements
 * src0Base/src1Base  - first rows of the two inputs; rows are located with
 *                      internal::getRowPtr and the given strides
 *
 * Without CAROTENE_NEON the function returns 0 after calling
 * internal::assertSupportedConfiguration().
 */
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Equal strides that match the row width mean both images are densely
    // packed, so they can be walked as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// (worst case (-128) * (-128) = 16384 per product).
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE (131070*8)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        while(i + 16 <= size.width)
        {
            // Use the signed block size defined above; the unsigned constant
            // from the u8 overload was smaller than necessary here and left
            // DOT_INT_BLOCKSIZE unused.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                // Pairwise add-accumulate: two products per 32-bit lane per step.
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Flush the 32-bit partial sums into the 64-bit accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // One optional 8-element step.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
/*
 * Dot product of two f32 images: the sum over all pixels of src0 * src1,
 * returned as f64.
 *
 * _size              - width/height of the processed region, in elements
 * src0Base/src1Base  - first rows of the two inputs; rows are located with
 *                      internal::getRowPtr and the given strides
 *
 * Products are accumulated in single precision inside blocks of at most
 * DOT_FLOAT_BLOCKSIZE elements; each block sum is then added to the f64
 * total.
 *
 * Without CAROTENE_NEON the function returns 0 after calling
 * internal::assertSupportedConfiguration().
 */
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Equal strides that match the row size in bytes mean both images are
    // densely packed, so they can be walked as one long row.
    const bool continuous = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width * sizeof(f32));
    if (continuous)
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;
        while (x + 4 <= size.width)
        {
            // Last start index of a full 4-element step inside this block.
            const size_t last = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;

            float32x4_t v_acc = vdupq_n_f32(0.0f);
            for (; x <= last; x += 4)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                v_acc = vmlaq_f32(v_acc, vld1q_f32(rowA + x), vld1q_f32(rowB + x));
            }

            // Horizontal reduction of the four lanes.
            const float32x2_t v_pair = vpadd_f32(vget_low_f32(v_acc), vget_high_f32(v_acc));
            total += vget_lane_f32(v_pair, 0) + vget_lane_f32(v_pair, 1);
        }

        // One optional 2-element step.
        if (x + 2 <= size.width)
        {
            const float32x2_t v_prod = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
            total += vget_lane_f32(v_prod, 0) + vget_lane_f32(v_prod, 1);
            x += 2;
        }

        // Scalar tail.
        for (; x < size.width; ++x)
            total += rowA[x] * rowB[x];
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
} // namespace CAROTENE_NS

428
3rdparty/carotene/src/fast.cpp vendored Normal file
View File

@ -0,0 +1,428 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is the FAST corner detector, contributed to OpenCV by its author, Edward Rosten.
Below are the original copyright notice and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
/*
 * Fills pixel[0..15] with the offsets, relative to a centre pixel, of the 16
 * samples on the radius-3 circle used by the detector.
 *
 * pixel       - output array with room for at least 16 entries
 * row_stride  - distance between vertically adjacent pixels, in the same
 *               units the resulting offsets will be added in
 *
 * Entry k is dx[k] + row_stride * dy[k]; the walk starts at (0, +3) and
 * proceeds through (+3, 0), (0, -3) and (-3, 0).
 */
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
    static const ptrdiff_t dx[16] = {  0,  1,  2,  3,  3,  3,  2,  1,
                                       0, -1, -2, -3, -3, -3, -2, -1 };
    static const ptrdiff_t dy[16] = {  3,  3,  2,  1,  0, -1, -2, -3,
                                      -3, -3, -2, -1,  0,  1,  2,  3 };

    for (int k = 0; k < 16; ++k)
        pixel[k] = dx[k] + row_stride * dy[k];
}
/*
 * Computes the corner score of the pixel at ptr from its circle samples.
 *
 * ptr    - pointer to the centre pixel
 * pixel  - offsets of the circle samples relative to ptr; the first
 *          N = 16 + K + 1 = 25 entries are read
 *
 * With d[k] = centre - sample[k], the code evaluates, for every start
 * position k in 0..15, the minimum and maximum of d over a window of nine
 * consecutive samples (either d[k..k+8] or d[k+1..k+9]). q0 keeps the largest
 * window minimum, q1 the smallest window maximum. The returned value is
 * max(q0, -q1) - 1, reduced over all lanes and cast to u8.
 */
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
    const s32 K = 8, N = 16 + K + 1;
    s32 k, v = ptr[0];
    // The array is padded up to a multiple of 8 so it can be loaded as whole
    // vectors. Zero-initialize it: only d[0..N-1] are written below, yet
    // vld1q_s16(d + 24) also loads d[25..31]. Those lanes never reach the
    // result (only lane 0 of that vector is used), but they must not be read
    // uninitialized.
    s16 d[(N + 7) & ~7] = {0};
    for( k = 0; k < N; k++ )
        d[k] = (s16)(v - ptr[pixel[k]]);

    int16x8_t q0 = vdupq_n_s16((s16)(-1000));
    int16x8_t q1 = vdupq_n_s16((s16)(1000));

    int16x8_t d0_7   = vld1q_s16(d +  0);
    int16x8_t d8_15  = vld1q_s16(d +  8);
    int16x8_t d16_23 = vld1q_s16(d + 16);
    int16x8_t d24    = vld1q_s16(d + 24);

    //k == 0
    // Lane k of vextq_s16(d0_7, d8_15, s) is d[k + s]; ak0/bk0 end up holding
    // min/max of d[k+1 .. k+8] for k = 0..7.
    int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
    int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
    int16x8_t ak0 = vminq_s16(v0k0, v1k0);
    int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 3);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    v1k0 = vextq_s16(d0_7, d8_15, 4);
    ak0 = vminq_s16(ak0, v1k0);
    bk0 = vmaxq_s16(bk0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 5);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    v1k0 = vextq_s16(d0_7, d8_15, 6);
    ak0 = vminq_s16(ak0, v1k0);
    bk0 = vmaxq_s16(bk0, v1k0);

    v0k0 = vextq_s16(d0_7, d8_15, 7);
    ak0 = vminq_s16(ak0, v0k0);
    bk0 = vmaxq_s16(bk0, v0k0);

    ak0 = vminq_s16(ak0, d8_15);
    bk0 = vmaxq_s16(bk0, d8_15);

    // Extend the window by d[k] ...
    q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
    q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));

    // ... and, alternatively, by d[k+9].
    v1k0 = vextq_s16(d8_15, d16_23, 1);
    q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
    q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));

    //k == 8
    // Same computation for start positions k = 8..15.
    int16x8_t v0k8 = v1k0;
    int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
    int16x8_t ak8 = vminq_s16(v0k8, v1k8);
    int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 3);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    v1k8 = vextq_s16(d8_15, d16_23, 4);
    ak8 = vminq_s16(ak8, v1k8);
    bk8 = vmaxq_s16(bk8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 5);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    v1k8 = vextq_s16(d8_15, d16_23, 6);
    ak8 = vminq_s16(ak8, v1k8);
    bk8 = vmaxq_s16(bk8, v1k8);

    v0k8 = vextq_s16(d8_15, d16_23, 7);
    ak8 = vminq_s16(ak8, v0k8);
    bk8 = vmaxq_s16(bk8, v0k8);

    ak8 = vminq_s16(ak8, d16_23);
    bk8 = vmaxq_s16(bk8, d16_23);

    q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
    q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));

    // Lanes 0..6 come from d[17..23]; lane 7 is d[24], the only element of
    // the d24 vector that is consumed.
    v1k8 = vextq_s16(d16_23, d24, 1);
    q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
    q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));

    //fin
    // Combine max(q0, -q1) and reduce horizontally to a single maximum.
    int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
    int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
    int32x4_t q2w = vmovl_s16(q2);
    int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
    int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));

    return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
/*
 * FAST corner detection on a u8 image, using the 16-sample radius-3 circle
 * produced by makeOffsets(). A pixel is reported when more than K = 8
 * consecutive circle samples are all brighter than centre + threshold or all
 * darker than centre - threshold.
 *
 * size                - image width/height in pixels
 * srcBase, srcStride  - first image row and the row stride passed to
 *                       internal::getRowPtr
 * keypoints           - receiver of the results; every detection is reported
 *                       through keypoints->push(x, y, 7.f, -1, score)
 * threshold           - intensity difference threshold
 * nonmax_suppression  - when true, a corner is only reported if its score
 *                       (see cornerScore) is strictly greater than the scores
 *                       of its 8 neighbours; when false the score buffers
 *                       stay zero, so the reported score is 0
 *
 * Rows are detected one at a time and emitted with a delay of one row, so the
 * scores of the previous, current and next row are available for the
 * non-maximum test. Three rotating score buffers and three rotating
 * corner-position lists hold that state.
 *
 * Without CAROTENE_NEON the function only calls
 * internal::assertSupportedConfiguration().
 */
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// NOTE: the keypoint store is not cleared here; only push() is called on it.
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
// Repeat the first K + 1 offsets after the 16 circle entries so runs that
// wrap around the circle can be scanned linearly.
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
// 128 is used to map u8 values into the signed range for signed compares.
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
// Lookup table indexed by (sample - centre + 255):
// 1 = sample darker than centre - threshold, 2 = brighter than centre + threshold, 0 = similar.
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
// Single allocation holding three score rows (u8) followed by three
// corner-position rows (ptrdiff_t).
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
// The "+ 1" leaves slot [-1] of each position row free for its corner count.
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
// The last iteration (i == height - 3) performs no detection; it only
// emits the corners collected for the previous row.
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
// Vector path: 16 candidate pixels per iteration.
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
// v1 = centre - threshold, v2 = centre + threshold (saturating), both
// shifted into the signed domain.
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
// Quick rejection using the four circle samples 0, 4, 8 and 12: keep a
// lane only if two adjacent ones of them are both brighter or both darker.
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
if( mask[0] == 0 )
{
// No candidate in the first 8 lanes. If the upper 8 lanes hold one, step
// back by 8 so that, after the loop increment of 16, the next iteration
// starts at those lanes.
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
// Full test: c0/c1 count the current run of brighter/darker samples per
// lane (reset to 0 when the run breaks), max0/max1 track the longest run.
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
u8 m[16];
// A lane is a corner when its longest run exceeds K.
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
// Scalar path for the remaining columns of the row.
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
// tab[x] classifies a sample value x relative to the centre value v.
const u8* tab = &threshold_tab[0] - v + 255;
// Progressive rejection using opposite pairs of circle samples.
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
// Bit 0 set: a run of darker samples is still possible.
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
// Bit 1 set: a run of brighter samples is still possible.
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
// Store the corner count of this row in front of its position list.
cornerpos[-1] = ncorners;
// The first processed row has no previous row to emit yet.
if( i == 3 )
continue;
// Emit the corners of row i - 1, comparing against rows i - 2 and i.
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS

442
3rdparty/carotene/src/fill_minmaxloc.cpp vendored Normal file
View File

@ -0,0 +1,442 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
/*
 * Scans src[j0 .. j1) of row i and records the coordinates of every element
 * equal to maxVal or minVal.
 *
 * Each hit is stored as the pair (column, row) in two consecutive slots of
 * the matching location array, provided the current count is still below the
 * capacity. The count is advanced by 2 for every hit whether or not it was
 * stored, so on return it may exceed the capacity. Counts and capacities are
 * therefore expressed in array slots, not in locations.
 *
 * An element equal to both maxVal and minVal is recorded in both lists.
 */
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    size_t col = j0;
    while (col < j1)
    {
        const T sample = src[col];

        if (sample == maxVal)
        {
            if (maxLocCount < maxLocCapacity)
            {
                size_t * slot = maxLocPtr + maxLocCount;
                slot[0] = col;
                slot[1] = i;
            }
            maxLocCount += 2;
        }

        if (sample == minVal)
        {
            if (minLocCount < minLocCapacity)
            {
                size_t * slot = minLocPtr + minLocCount;
                slot[0] = col;
                slot[1] = i;
            }
            minLocCount += 2;
        }

        ++col;
    }
}
} // namespace
#endif
/*
 * Records the (column, row) position of every u8 element that equals minVal
 * or maxVal.
 *
 * Capacities arrive in locations and are doubled internally because process()
 * counts individual size_t slots (two per location); the counts are halved
 * again before returning. The counts are not doubled on entry, so callers are
 * presumably expected to pass them zero-initialised -- TODO confirm.
 * Without CAROTENE_NEON only the configuration assert is executed.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u8 * srcBase, ptrdiff_t srcStride,
                    u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // exclusive column limits up to which a full 16- / 8-element load fits in a row
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    const uint8x16_t vMax16 = vdupq_n_u8(maxVal);
    const uint8x16_t vMin16 = vdupq_n_u8(minVal);
    const uint8x8_t vMax8 = vdup_n_u8(maxVal);
    const uint8x8_t vMin8 = vdup_n_u8(minVal);

    // per-byte match flags; each u64 word summarises 8 consecutive elements
    u64 hitMask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < end16; col += 16)
        {
            internal::prefetch(src + col);
            const uint8x16_t vSrc = vld1q_u8(src + col);
            const uint8x16_t vHitMax = vceqq_u8(vSrc, vMax16);
            const uint8x16_t vHitMin = vceqq_u8(vSrc, vMin16);
            vst1q_u8((u8 *)&hitMask[0], vorrq_u8(vHitMax, vHitMin));

            // only walk an 8-element half when at least one of its lanes matched
            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
            if (hitMask[1])
            {
                process(src, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        for ( ; col < end8; col += 8)
        {
            const uint8x8_t vSrc = vld1_u8(src + col);
            const uint8x8_t vHitMax = vceq_u8(vSrc, vMax8);
            const uint8x8_t vHitMin = vceq_u8(vSrc, vMin8);
            vst1_u8((u8 *)&hitMask[0], vorr_u8(vHitMax, vHitMin));

            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        // scalar remainder of the row
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // convert slot counts back to location counts
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) position of every u16 element that equals minVal
 * or maxVal.
 *
 * Capacities arrive in locations and are doubled internally because process()
 * counts individual size_t slots (two per location); the counts are halved
 * again before returning. The counts are not doubled on entry, so callers are
 * presumably expected to pass them zero-initialised -- TODO confirm.
 * Without CAROTENE_NEON only the configuration assert is executed.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u16 * srcBase, ptrdiff_t srcStride,
                    u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // exclusive column limits up to which 16 / 8 elements can still be loaded
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    const uint16x8_t vMax = vdupq_n_u16(maxVal);
    const uint16x8_t vMin = vdupq_n_u16(minVal);

    // narrowed per-element match flags; each u64 word summarises 8 elements
    u64 hitMask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < end16; col += 16)
        {
            internal::prefetch(src + col);
            const uint16x8_t vLo = vld1q_u16(src + col);
            const uint16x8_t vHi = vld1q_u16(src + col + 8);
            const uint16x8_t vHitLo = vorrq_u16(vceqq_u16(vLo, vMax), vceqq_u16(vLo, vMin));
            const uint16x8_t vHitHi = vorrq_u16(vceqq_u16(vHi, vMax), vceqq_u16(vHi, vMin));
            // narrow the 16-bit flags to bytes so both halves fit into two u64 words
            vst1q_u8((u8 *)&hitMask[0], vcombine_u8(vmovn_u16(vHitLo), vmovn_u16(vHitHi)));

            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
            if (hitMask[1])
            {
                process(src, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        for ( ; col < end8; col += 8)
        {
            internal::prefetch(src + col);
            const uint16x8_t vSrc = vld1q_u16(src + col);
            const uint16x8_t vHitMax = vceqq_u16(vSrc, vMax);
            const uint16x8_t vHitMin = vceqq_u16(vSrc, vMin);
            vst1_u8((u8 *)&hitMask[0], vmovn_u16(vorrq_u16(vHitMax, vHitMin)));

            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        // scalar remainder of the row
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // convert slot counts back to location counts
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) position of every s16 element that equals minVal
 * or maxVal.
 *
 * Capacities arrive in locations and are doubled internally because process()
 * counts individual size_t slots (two per location); the counts are halved
 * again before returning. The counts are not doubled on entry, so callers are
 * presumably expected to pass them zero-initialised -- TODO confirm.
 * Without CAROTENE_NEON only the configuration assert is executed.
 */
void fillMinMaxLocs(const Size2D & size,
                    const s16 * srcBase, ptrdiff_t srcStride,
                    s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // exclusive column limits up to which 16 / 8 elements can still be loaded
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    const int16x8_t vMax = vdupq_n_s16(maxVal);
    const int16x8_t vMin = vdupq_n_s16(minVal);

    // narrowed per-element match flags; each u64 word summarises 8 elements
    u64 hitMask[2] = { 0ul };

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < end16; col += 16)
        {
            internal::prefetch(src + col);
            const int16x8_t vLo = vld1q_s16(src + col);
            const int16x8_t vHi = vld1q_s16(src + col + 8);
            const uint16x8_t vHitLo = vorrq_u16(vceqq_s16(vLo, vMax), vceqq_s16(vLo, vMin));
            const uint16x8_t vHitHi = vorrq_u16(vceqq_s16(vHi, vMax), vceqq_s16(vHi, vMin));
            // narrow the 16-bit flags to bytes so both halves fit into two u64 words
            vst1q_u8((u8 *)&hitMask[0], vcombine_u8(vmovn_u16(vHitLo), vmovn_u16(vHitHi)));

            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
            if (hitMask[1])
            {
                process(src, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        for ( ; col < end8; col += 8)
        {
            internal::prefetch(src + col);
            const int16x8_t vSrc = vld1q_s16(src + col);
            const uint16x8_t vHitMax = vceqq_s16(vSrc, vMax);
            const uint16x8_t vHitMin = vceqq_s16(vSrc, vMin);
            vst1_u8((u8 *)&hitMask[0], vmovn_u16(vorrq_u16(vHitMax, vHitMin)));

            if (hitMask[0])
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        // scalar remainder of the row
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // convert slot counts back to location counts
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) position of every s32 element that equals minVal
 * or maxVal.
 *
 * Capacities arrive in locations and are doubled internally because process()
 * counts individual size_t slots (two per location); the counts are halved
 * again before returning. The counts are not doubled on entry, so callers are
 * presumably expected to pass them zero-initialised -- TODO confirm.
 * Without CAROTENE_NEON only the configuration assert is executed.
 */
void fillMinMaxLocs(const Size2D & size,
                    const s32 * srcBase, ptrdiff_t srcStride,
                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // exclusive column limit up to which 8 elements (two 4-lane loads) still fit
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    const int32x4_t vMax = vdupq_n_s32(maxVal);
    const int32x4_t vMin = vdupq_n_s32(minVal);

    // one byte of match flag per element, 8 elements per iteration
    u64 hitMask = 0ul;

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s32 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < end8; col += 8)
        {
            internal::prefetch(src + col);
            const int32x4_t vLo = vld1q_s32(src + col);
            const int32x4_t vHi = vld1q_s32(src + col + 4);
            const uint32x4_t vHitLo = vorrq_u32(vceqq_s32(vLo, vMax), vceqq_s32(vLo, vMin));
            const uint32x4_t vHitHi = vorrq_u32(vceqq_s32(vHi, vMax), vceqq_s32(vHi, vMin));
            // narrow 32 -> 16 -> 8 bits so the eight flags fit into a single u64
            const uint16x8_t vHit16 = vcombine_u16(vmovn_u32(vHitLo), vmovn_u32(vHitHi));
            vst1_u8((u8 *)&hitMask, vmovn_u16(vHit16));

            if (hitMask)
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        // scalar remainder of the row
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // convert slot counts back to location counts
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) position of every u32 element that equals minVal
 * or maxVal.
 *
 * Capacities arrive in locations and are doubled internally because process()
 * counts individual size_t slots (two per location); the counts are halved
 * again before returning. The counts are not doubled on entry, so callers are
 * presumably expected to pass them zero-initialised -- TODO confirm.
 * Without CAROTENE_NEON only the configuration assert is executed.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u32 * srcBase, ptrdiff_t srcStride,
                    u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // exclusive column limit up to which 8 elements (two 4-lane loads) still fit
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    const uint32x4_t vMax = vdupq_n_u32(maxVal);
    const uint32x4_t vMin = vdupq_n_u32(minVal);

    // one byte of match flag per element, 8 elements per iteration
    u64 hitMask = 0ul;

    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u32 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        for ( ; col < end8; col += 8)
        {
            internal::prefetch(src + col);
            const uint32x4_t vLo = vld1q_u32(src + col);
            const uint32x4_t vHi = vld1q_u32(src + col + 4);
            const uint32x4_t vHitLo = vorrq_u32(vceqq_u32(vLo, vMax), vceqq_u32(vLo, vMin));
            const uint32x4_t vHitHi = vorrq_u32(vceqq_u32(vHi, vMax), vceqq_u32(vHi, vMin));
            // narrow 32 -> 16 -> 8 bits so the eight flags fit into a single u64
            const uint16x8_t vHit16 = vcombine_u16(vmovn_u32(vHitLo), vmovn_u32(vHitHi));
            vst1_u8((u8 *)&hitMask, vmovn_u16(vHit16));

            if (hitMask)
            {
                process(src, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            }
        }

        // scalar remainder of the row
        process(src, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // convert slot counts back to location counts
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS

222
3rdparty/carotene/src/flip.cpp vendored Normal file
View File

@ -0,0 +1,222 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
/*
 * Tells whether flip() can handle the given mode / element size.
 *
 * A vertical-only flip is accepted for any element size; horizontal and
 * both-axes flips require an element size of 1, 2, 3 or 4 bytes. Nothing is
 * supported unless isSupportedConfiguration() holds.
 */
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
    if (!isSupportedConfiguration())
        return false;

    if (flipMode == FLIP_VERTICAL_MODE)
        return true;

    const bool mirrorsColumns = (flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE);
    const bool supportedElemSize = (elemSize >= 1) && (elemSize <= 4);
    return mirrorsColumns && supportedElemSize;
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef ANDROID
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef ANDROID
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef ANDROID
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // ANDROID
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
// Common signature of the flip<T> / flip3<T> instantiations; the public flip()
// stores the implementation chosen for the element size in a pointer of this type.
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
/*
 * Flips an image whose elements are elemSize bytes wide.
 *
 * The mode / element size combination is asserted through isFlipSupported().
 * A vertical-only flip copies whole rows in reverse order with memcpy; every
 * other mode is forwarded to the flip<T> / flip3<T> instantiation matching
 * elemSize. Without CAROTENE_NEON only the assert is executed.
 */
void flip(const Size2D &size,
          const u8 * srcBase, ptrdiff_t srcStride,
          u8 * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode, u32 elemSize)
{
    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
    if (flipMode == FLIP_VERTICAL_MODE)
    {
        // rows keep their content, only their order changes
        for (size_t row = 0; row < size.height; ++row)
        {
            const u8 * from = internal::getRowPtr(srcBase, srcStride, row);
            u8 * to = internal::getRowPtr(dstBase, dstStride, size.height - row - 1);
            std::memcpy(to, from, elemSize * size.width);
        }
        return;
    }

    // pick the implementation matching the element size
    flipFunc impl = NULL;
    if (elemSize == (u32)sizeof(u8))
        impl = &flip<u8>;
    if (elemSize == (u32)sizeof(u16))
        impl = &flip<u16>;
    if (elemSize == (u32)sizeof(u32))
        impl = &flip<u32>;
    if (elemSize == (u32)sizeof(u8) * 3)
        impl = &flip3<u8>;

    if (impl != NULL)
    {
        impl(size,
             srcBase, srcStride,
             dstBase, dstStride,
             flipMode);
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)flipMode;
    (void)elemSize;
#endif
}
} // namespace CAROTENE_NS

1059
3rdparty/carotene/src/gaussian_blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

195
3rdparty/carotene/src/in_range.cpp vendored Normal file
View File

@ -0,0 +1,195 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// vnst: stores two comparison-mask vectors to dst as one mask byte per lane,
// narrowing wider lanes to 8 bits first.

// 8-bit lanes: the two vectors are stored back to back (32 bytes).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
{
    vst1q_u8(dst, v1);
    vst1q_u8(dst + 16, v2);
}

// 16-bit lanes: each vector is narrowed to 8 bytes, 16 bytes are stored.
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
{
    const uint8x8_t lowHalf = vmovn_u16(v1);
    const uint8x8_t highHalf = vmovn_u16(v2);
    vst1q_u8(dst, vcombine_u8(lowHalf, highHalf));
}

// 32-bit lanes: narrowed twice (32 -> 16 -> 8 bits), 8 bytes are stored.
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
{
    const uint16x8_t narrowed16 = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
    vst1_u8(dst, vmovn_u16(narrowed16));
}
// Generic tail handler, selected for element sizes without a specialization
// below: it leaves the position untouched so the caller's scalar loop handles
// the whole tail.
template <typename T, int elsize>
struct vtail
{
    static inline void inRange(const T *, const T *, const T *,
                               u8 *, size_t &, size_t)
    {
        // intentionally empty: there cannot be enough data left for a vector step
    }
};
// Tail handler for 2-byte elements: processes one additional 8-element vector
// when at least 8 elements remain, advancing x past them.
template <typename T> struct vtail<T, 2>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        // There are no more than 15 elements in the tail, so an 8-element
        // vector can be handled at most once.
        // `<=` (not `<`): a tail of exactly 8 elements still fits one vector.
        if( x + 8 <= width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            // lane mask: rng1 <= src && src <= rng2
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            // narrow the 16-bit masks to one byte per element
            internal::vst1(dst + x, internal::vmovn(vd));
            x+=8;
        }
    }
};
// Tail handler for 1-byte elements: processes one 16-element vector and/or one
// 8-element vector when that many elements remain, advancing x past them.
template <typename T> struct vtail<T, 1>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<T>::vec64 vec64;
        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
        // There are no more than 31 elements in the tail, so 16+8, 16 or 8
        // elements can be handled here, each step at most once.
        // `<=` (not `<`): a tail of exactly 16 elements still fits one vector.
        if( x + 16 <= width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            // lane mask: rng1 <= src && src <= rng2
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1q(dst + x, vd);
            x+=16;
        }
        // `<=` (not `<`): a remainder of exactly 8 elements still fits one vector.
        if( x + 8 <= width)
        {
            vec64 vs = internal::vld1( src + x);
            vec64 vr1 = internal::vld1(rng1 + x);
            vec64 vr2 = internal::vld1(rng2 + x);
            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
            internal::vst1(dst + x, vd);
            x+=8;
        }
    }
};
/*
 * Writes, for every element, 0xFF to dst when rng1 <= src <= rng2 holds and
 * 0x00 otherwise (one mask byte per element).
 *
 * Each row is processed in three stages: a main loop over 32 bytes of source
 * data per iteration, a vtail<> step for shorter vectors, and a scalar loop
 * for whatever is left.
 */
template <typename T>
inline void inRangeCheck(const Size2D &_size,
                         const T * srcBase, ptrdiff_t srcStride,
                         const T * rng1Base, ptrdiff_t rng1Stride,
                         const T * rng2Base, ptrdiff_t rng2Stride,
                         u8 * dstBase, ptrdiff_t dstStride)
{
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;

    Size2D size(_size);

    // When all four buffers are densely packed the image can be treated as a
    // single long row. Strides are taken to be byte offsets (NOTE(review):
    // confirm against internal::getRowPtr), so the T-typed inputs are packed
    // at width * sizeof(T) bytes while the u8 destination is packed at width
    // bytes. Comparing all of them against plain `width` only worked for
    // 1-byte element types.
    const ptrdiff_t packedInStride = (ptrdiff_t)(size.width * sizeof(T));
    const ptrdiff_t packedOutStride = (ptrdiff_t)(size.width);
    if (srcStride == packedInStride &&
        rng1Stride == packedInStride &&
        rng2Stride == packedInStride &&
        dstStride == packedOutStride)
    {
        size.width *= size.height;
        size.height = 1;
    }

    // largest multiple of the main-loop step (32 bytes of T) not exceeding the row width
    const size_t width = size.width & ~( 32/sizeof(T) - 1 );

    for(size_t j = 0; j < size.height; ++j)
    {
        const T * src = internal::getRowPtr( srcBase, srcStride, j);
        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
        size_t i = 0;
        for( ; i < width; i += 32/sizeof(T) )
        {
            internal::prefetch(src + i);
            internal::prefetch(rng1 + i);
            internal::prefetch(rng2 + i);

            // first 128-bit vector
            vec128 vs = internal::vld1q( src + i);
            vec128 vr1 = internal::vld1q(rng1 + i);
            vec128 vr2 = internal::vld1q(rng2 + i);
            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));

            // second 128-bit vector
            vs = internal::vld1q( src + i + 16/sizeof(T));
            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));

            // narrow both masks to bytes and store them
            vnst(dst + i, vd1, vd2);
        }
        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
        for( ; i < size.width; i++ )
            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));   // true -> 0xFF, false -> 0x00
    }
}
}
// NEON build: defines the public inRange() overload for element type T. It
// asserts the configuration and forwards every argument to inRangeCheck().
#define INRANGEFUNC(T)                                              \
void inRange(const Size2D &size,                                    \
             const T * srcBase,  ptrdiff_t srcStride,               \
             const T * rng1Base, ptrdiff_t rng1Stride,              \
             const T * rng2Base, ptrdiff_t rng2Stride,              \
             u8 * dstBase,       ptrdiff_t dstStride)               \
{                                                                   \
    internal::assertSupportedConfiguration();                       \
    inRangeCheck(size,                                              \
                 srcBase, srcStride,                                \
                 rng1Base, rng1Stride,                              \
                 rng2Base, rng2Stride,                              \
                 dstBase, dstStride);                               \
}
#else
// Non-NEON build: defines an inRange() overload for element type T that only
// runs the configuration assert; all arguments are ignored.
#define INRANGEFUNC(T)                                              \
void inRange(const Size2D &,                                        \
             const T *, ptrdiff_t,                                  \
             const T *, ptrdiff_t,                                  \
             const T *, ptrdiff_t,                                  \
             u8 *, ptrdiff_t)                                       \
{                                                                   \
    internal::assertSupportedConfiguration();                       \
}
#endif
// Public inRange() overloads, one per element type listed here
// (each expands the INRANGEFUNC macro defined above).
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS

238
3rdparty/carotene/src/integral.cpp vendored Normal file
View File

@ -0,0 +1,238 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
/*
 * Computes the integral image of an 8-bit source:
 *   sum(i, j) = sum of src(y, x) for all y <= i, x <= j
 * The output has the same width and height as the source (no extra zero
 * row/column is written). Without CAROTENE_NEON only the configuration assert
 * is executed.
 *
 * Each row is a running prefix sum; rows after the first additionally add the
 * already computed sum row above them.
 */
void integral(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u32 * sumBase, ptrdiff_t sumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Row 0 is processed unconditionally below, so an empty image must bail
    // out here to avoid touching memory outside the buffers.
    if (size.width == 0 || size.height == 0)
        return;

    uint32x4_t v_zero = vmovq_n_u32(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);

    // running row total of all previous 8-element groups, replicated in every lane
    uint32x4_t prev = v_zero;
    size_t j = 0u;

    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sum + j);
        internal::prefetch(src + j);

        // the source bytes and copies of them moved up by 1, 2 and 3 lanes
        uint8x8_t el8shr0 = vld1_u8(src + j);
        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

        // lane k of el8 = src[k-3] + ... + src[k] (missing terms are zero)
        uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
        uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);

        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
        // low half already holds prefix sums 0..3; low + high gives prefix sums 4..7
        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
        uint32x4_t vsumh = vaddw_u16(prev, el4h);

        vst1q_u32(sum + j, vsuml);
        vst1q_u32(sum + j + 4, vsumh);

        // carry the total of these 8 elements into the next group
        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
    }

    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
        sum[j] = (v += src[j]);

    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
        sum = internal::getRowPtr(sumBase, sumStride, i);

        prev = v_zero;
        j = 0u;

        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sum + j);
            internal::prefetch(src + j);

            // start from the integral values of the row above
            uint32x4_t vsuml = vld1q_u32(prevSum + j);
            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);

            uint8x8_t el8shr0 = vld1_u8(src + j);
            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

            vsuml = vaddq_u32(vsuml, prev);
            vsumh = vaddq_u32(vsumh, prev);

            uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
            uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);

            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
            vsumh = vaddw_u16(vsumh, el4h);

            vst1q_u32(sum + j, vsuml);
            vst1q_u32(sum + j + 4, vsumh);

            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
        }

        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
            sum[j] = (v += src[j]) + prevSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sumBase;
    (void)sumStride;
#endif
}
/*
 * Computes the integral image of squared 8-bit source values:
 *   sqsum(i, j) = sum of src(y, x)^2 for all y <= i, x <= j
 * The output has the same width and height as the source (no extra zero
 * row/column is written). Without CAROTENE_NEON only the configuration assert
 * is executed.
 *
 * The prefix sums of each 8-element group are built with integer NEON
 * arithmetic and then accumulated into the f64 output.
 */
void sqrIntegral(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 f64 * sqsumBase, ptrdiff_t sqsumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Row 0 is processed unconditionally below, so an empty image must bail
    // out here to avoid touching memory outside the buffers.
    if (size.width == 0 || size.height == 0)
        return;

    uint16x8_t v_zero8 = vmovq_n_u16(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);

    // running row total of all previous 8-element groups
    double prev = 0.;
    size_t j = 0u;

    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sqsum + j);
        internal::prefetch(src + j);

        uint8x8_t vsrc = vld1_u8(src + j);

        // squares and the same squares moved up by one lane
        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

        // lane k = sq[k] + sq[k-1]
        uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
        uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

        // combine the pair sums into prefix sums 2..3, 4..5 and 6..7
        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

        // buf[k] = prefix sum of the squares within this group
        u32 buf[8];
        vst1_u32(buf, vget_low_u32(el8shr01l));
        vst1_u32(buf+2, el2l);
        vst1_u32(buf+4, el2hl);
        vst1_u32(buf+6, el2hh);
        for(u32 k=0; k < 8; k++)
            sqsum[j+k] = prev + buf[k];
        prev += buf[7];
    }

    for (; j < size.width; ++j)
        sqsum[j] = (prev += src[j]*src[j]);

    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);

        prev = 0.;
        j = 0u;

        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sqsum + j);
            internal::prefetch(src + j);

            uint8x8_t vsrc = vld1_u8(src + j);

            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

            uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
            uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

            u32 buf[8];
            vst1_u32(buf, vget_low_u32(el8shr01l));
            vst1_u32(buf+2, el2l);
            vst1_u32(buf+4, el2hl);
            vst1_u32(buf+6, el2hh);
            // row prefix sum plus the integral value of the row above
            for(u32 k=0; k < 8; k++)
                sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
            prev += buf[7];
        }

        for (; j < size.width; ++j)
            sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sqsumBase;
    (void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS

112
3rdparty/carotene/src/intrinsics.hpp vendored Normal file
View File

@ -0,0 +1,112 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// Reciprocal (1/x) of each of the four lanes: take the hardware estimate
// (vrecpe) and improve it with two vrecps refinement steps.
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    const float32x4_t estimate = vrecpeq_f32(val);
    const float32x4_t refined  = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
    return vmulq_f32(vrecpsq_f32(val, refined), refined);
}
// Two-lane variant of the reciprocal: hardware estimate (vrecpe) followed
// by two vrecps refinement steps.
inline float32x2_t vrecp_f32(float32x2_t val)
{
    const float32x2_t estimate = vrecpe_f32(val);
    const float32x2_t refined  = vmul_f32(vrecps_f32(val, estimate), estimate);
    return vmul_f32(vrecps_f32(val, refined), refined);
}
// calculate reciprocal square root and square root values
// Reciprocal square root (1/sqrt(x)) of each of the four lanes: hardware
// estimate (vrsqrte) improved with two vrsqrts refinement steps.
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    const float32x4_t estimate = vrsqrteq_f32(val);
    const float32x4_t refined  =
        vmulq_f32(vrsqrtsq_f32(vmulq_f32(estimate, estimate), val), estimate);
    return vmulq_f32(vrsqrtsq_f32(vmulq_f32(refined, refined), val), refined);
}
// Two-lane variant of the reciprocal square root: hardware estimate
// (vrsqrte) improved with two vrsqrts refinement steps.
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    const float32x2_t estimate = vrsqrte_f32(val);
    const float32x2_t refined  =
        vmul_f32(vrsqrts_f32(vmul_f32(estimate, estimate), val), estimate);
    return vmul_f32(vrsqrts_f32(vmul_f32(refined, refined), val), refined);
}
// Square root of each of the four lanes, computed as the reciprocal of the
// reciprocal square root (both via the refined helpers in this header).
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    const float32x4_t inv_root = vrsqrtq_f32(val);
    return vrecpq_f32(inv_root);
}
// Two-lane square root, computed as the reciprocal of the reciprocal
// square root (both via the refined helpers in this header).
inline float32x2_t vsqrt_f32(float32x2_t val)
{
    const float32x2_t inv_root = vrsqrt_f32(val);
    return vrecp_f32(inv_root);
}
// Table lookup with the 16-byte table held in one 128-bit register:
// `a` is the table, `b` holds the eight byte indices to fetch.
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    // ARMv7 only has lookups over 64-bit table registers, so split the table
    // into its low and high halves with lane-defined intrinsics (instead of
    // reinterpreting the register through a union, which is undefined
    // behaviour in C++) and use the two-register lookup.
    uint8x8x2_t table;
    table.val[0] = vget_low_u8(a);
    table.val[1] = vget_high_u8(a);
    return vtbl2_u8(table, b);
#endif
}
} }
#endif

713
3rdparty/carotene/src/laplacian.cpp vendored Normal file
View File

@ -0,0 +1,713 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
// Reports whether Laplacian3x3() can process an image of the given size with
// the given border mode: the configuration must be supported, the image must
// be at least 8 pixels wide, and only CONSTANT / REPLICATE borders are handled.
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 8)
        return false;
    return border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE;
}
// 3x3 Laplacian of an 8-bit image with the result saturated to u8.
//
// Every output pixel is (sum of the 3x3 neighbourhood) - 9 * centre, i.e. the
// kernel [1 1 1; 1 -8 1; 1 1 1].
//
//   size                 image dimensions (see isLaplacian3x3Supported()).
//   srcBase, srcStride   source rows, addressed through internal::getRowPtr().
//   dstBase, dstStride   destination rows, addressed the same way.
//   border               BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE.
//   borderValue          pixel value outside the image for the CONSTANT mode.
//
// Each row is processed in two phases: a NEON loop that works on 8-pixel
// blocks one block behind the loads, and a scalar loop for the remaining
// right-hand columns. Without CAROTENE_NEON the function does nothing beyond
// the configuration assertion.
void Laplacian3x3(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    uint8x8_t vsub;
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // A NULL neighbour row means "outside the image, use borderValue".
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Column sums (three rows each) for x-1, x and x+1, used by the scalar tail.
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // The vector loop loads 8 bytes at x <= bwidth; for the last two rows
        // the bound is lowered so that no load runs past the end of the row.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    // Column -1 lies outside the image in all three rows, so
                    // its column sum is 3 * borderValue (matching v_border_x3
                    // in the vector path and nextx at the right border).
                    prevx = borderValue * 3;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                // remember the centre row of this block; it is subtracted
                // when the block's output is produced on the next iteration
                vsub = x1;

                continue;
            }

            // combine 3 "shifted" vectors
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));

            // subtract 9 * centre (8 * centre + centre) and saturate to u8
            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
            uint8x8_t it0 = vqmovun_s16(tt0);
            vst1_u8(drow + x - 8, it0);

            vsub = x1;
        }

        // Step back to the first column the vector loop did not finish; when
        // the loop ran up to x == width the last column is redone in scalar
        // code because its right neighbour was read from beyond the row.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
            {
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);
            }

            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// Reports whether the Laplacian{1,3,5}OpenCV() routines can process an image
// of the given size with the given border mode: the configuration must be
// supported, the image must be at least 8x1, and the border must be one of
// CONSTANT, REFLECT, REFLECT101 or REPLICATE.
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 8 || size.height < 1)
        return false;

    switch (border)
    {
    case BORDER_MODE_CONSTANT:
    case BORDER_MODE_REFLECT:
    case BORDER_MODE_REFLECT101:
    case BORDER_MODE_REPLICATE:
        return true;
    default:
        return false;
    }
}
// Laplacian with the cross-shaped kernel [0 1 0; 1 -4 1; 0 1 0] of an 8-bit
// image, written as s16 without scaling.
//
//   size                 image dimensions (see isLaplacianOpenCVSupported()).
//   srcBase, srcStride   source rows, addressed through internal::getRowPtr().
//   dstBase, dstStride   destination rows, addressed the same way.
//   border               CONSTANT, REFLECT, REFLECT101 or REPLICATE.
//   borderValue          pixel value outside the image for the CONSTANT mode.
//
// Each row is processed by a NEON loop over 8-pixel blocks (running one block
// behind the loads) followed by a scalar loop for the right-hand columns.
// Without CAROTENE_NEON the function does nothing beyond the assertion.
void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // Row of border pixels used in place of the rows above/below the
        // image. The vector loop below loads 8 bytes at offsets up to `cols`,
        // so 8 extra bytes are reserved to keep those loads inside the buffer.
        _tmp.assign(cols + 4 + 8,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            // for a single-row image there is no row 1 to reflect to
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        // centre-row pixels of the previous-previous, previous and current block
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);
        ptrdiff_t x = 0;
        // For the last two rows the bound is lowered so that no 8-byte load
        // runs past the end of the row.
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);

            if(x) {
                xx0 = xx1;
                xx1 = xx2;
            } else {
                xx1 = x1;
                // make border: lane 7 becomes the pixel left of column 0
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
                }
            }
            xx2 = x1;

            if(x) {
                tcurr = tnext;
            }
            // above + below - 4 * centre for the block just loaded
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));

            if(!x) {
                tcurr = tnext;
                continue;
            }
            // left and right neighbours of the previous block
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);

            vst1q_s16(drow + x - 8, t0);
        }

        // Continue in scalar code from the first unfinished column; the last
        // column is redone when the vector loop ran up to x == cols, because
        // its right neighbour was read from beyond the row.
        x -= 8;
        if(x == cols){
            x--;
        }

        for( ; x < cols; x++ )
        {
            s16 nextx;
            s16 prevx;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x-1];
                nextx = x == cols-1 ? v1[x] : v1[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x-1];
                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x-1];
                nextx = x == cols-1 ? borderValue : v1[x+1];
            }
            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// Laplacian with the diagonal kernel [2 0 2; 0 -8 0; 2 0 2] of an 8-bit
// image, written as s16.
//
//   size                 image dimensions (see isLaplacianOpenCVSupported()).
//   srcBase, srcStride   source rows, addressed through internal::getRowPtr().
//   dstBase, dstStride   destination rows, addressed the same way.
//   border               CONSTANT, REFLECT, REFLECT101 or REPLICATE.
//   borderValue          pixel value outside the image for the CONSTANT mode.
//
// Each row is processed by a NEON loop over 8-pixel blocks (running one block
// behind the loads) followed by a scalar loop for the right-hand columns.
// Without CAROTENE_NEON the function does nothing beyond the assertion.
void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // Row of border pixels used in place of the rows above/below the
        // image. The vector loop below loads 8 bytes at offsets up to `cols`,
        // so 8 extra bytes are reserved to keep those loads inside the buffer.
        _tmp.assign(cols + 4 + 8,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            // for a single-row image there is no row 1 to reflect to
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // tprev/tcurr/tnext: (above + below) sums of three consecutive blocks;
        // tc: 4 * centre of the block whose output is produced next.
        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);
        int16x8_t t0, t2, tcnext;
        ptrdiff_t x = 0;
        // For the last two rows the bound is lowered so that no 8-byte load
        // runs past the end of the row.
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));

            if(x) {
                tprev = tcurr;
                tcurr = tnext;
            }
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));

            if(!x) {
                tcurr = tnext;
                tc = tcnext;

                // make border: lane 7 becomes the (above + below) sum of the
                // column left of column 0
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    // both the pixel above and the pixel below are border
                    // pixels, so the sum is 2 * borderValue (the scalar loop
                    // below uses prevx + prevx2 = 2 * borderValue as well)
                    tcurr = vsetq_lane_s16((s16)(2 * borderValue), tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
                }
                continue;
            }

            // left and right column sums of the previous block
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);

            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;

            // final scale by 2
            t0 = vshlq_n_s16(t0, 1);
            vst1q_s16(drow + x - 8, t0);
        }

        // Continue in scalar code from the first unfinished column; the last
        // column is redone when the vector loop ran up to x == cols, because
        // its right neighbour was read from beyond the row.
        x -= 8;
        if(x == cols){
            x--;
        }

        for( ; x < cols; x++ )
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x-1];
                prevx2 = x == 0 ? v2[0] : v2[x-1];
                nextx = x == cols-1 ? v0[x] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x-1];
                prevx2 = x == 0 ? v2[1] : v2[x-1];
                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x-1];
                prevx2 = x == 0 ? borderValue : v2[x-1];
                nextx = x == cols-1 ? borderValue : v0[x+1];
                nextx2 = x == cols-1 ? borderValue : v2[x+1];
            }
            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// 5x5 Laplacian of an 8-bit image, written as s16.
//
// With rows r0..r4 centred on the output row, each output pixel is
//   2 * ( A(x-2) + B(x-1) + C(x) + B(x+1) + A(x+2) )
// where the per-column vertical sums are
//   A = r0 + 2*r1 + 2*r2 + 2*r3 + r4
//   B = 2*r0 - 4*r2 + 2*r4
//   C = 2*r0 - 4*r1 - 12*r2 - 4*r3 + 2*r4
//
//   size                 image dimensions (see isLaplacianOpenCVSupported()).
//   srcBase, srcStride   source rows, addressed through internal::getRowPtr().
//   dstBase, dstStride   destination rows, addressed the same way.
//   border               CONSTANT, REFLECT, REFLECT101 or REPLICATE.
//   borderValue          pixel value outside the image for the CONSTANT mode.
//
// Each row is processed by a NEON loop over 8-pixel blocks (running one block
// behind the loads) followed by a scalar loop for the right-hand columns.
// Without CAROTENE_NEON the function does nothing beyond the assertion.
void Laplacian5OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // Row of border pixels used in place of the rows above/below the
        // image. The vector loop below loads 8 bytes at offsets up to `cols`,
        // so 8 extra bytes are reserved to keep those loads inside the buffer.
        _tmp.assign(cols + 4 + 8,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = 0;
        const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v3 = 0;
        const u8* v4 = 0;
        // make border
        if (border == BORDER_MODE_REPLICATE) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
        } else if (border == BORDER_MODE_REFLECT) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
        } else if (border == BORDER_MODE_REFLECT101) {
            // NOTE(review): the original author flagged the v0 expression with "check".
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
            // NOTE(review): the original author flagged the v4 expression as
            // "bad if rows=2 y=1"; in that case it falls back to row 0.
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
            v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
            v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tnext, tc, t0;
        int16x8_t tnext2, tnext3;
        int16x8_t tnext1Old, tnext2Old, tnext3Old;
        int16x8_t tnext4OldOldOld, tnext5OldOldOld;

        // Roles when an output block is assembled:
        //   tnext1   - A sums at x+2        tcurr1   - B sums at x+1
        //   tprev1   - C sums at x          tpprev1  - B sums at x-1
        //   tppprev1 - A sums at x-2
        int16x8_t tcurr1 = vmovq_n_s16(0x0);
        int16x8_t tnext1 = vmovq_n_s16(0x0);
        int16x8_t tprev1 = vmovq_n_s16(0x0);
        int16x8_t tpprev1 = vmovq_n_s16(0x0);
        int16x8_t tppprev1 = vmovq_n_s16(0x0);

        // History of the A/B/C sums of earlier blocks.
        int16x8_t tnext4Old = vmovq_n_s16(0x0);
        int16x8_t tnext5Old = vmovq_n_s16(0x0);
        int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext5OldOld = vmovq_n_s16(0x0);

        // do vertical convolution
        ptrdiff_t x = 0;
        // For the last three rows the bound is lowered so that no 8-byte load
        // runs past the end of the row.
        const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            internal::prefetch(v3 + x);
            internal::prefetch(v4 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            uint8x8_t x3 = vld1_u8(v3 + x);
            uint8x8_t x4 = vld1_u8(v4 + x);
            if(x) {
                tcurr1 = tnext1;
            }

            // age the history by one block
            tnext4OldOldOld = tnext4Old;
            tnext5OldOldOld = tnext5Old;
            tnext1Old = tnext1OldOld;
            tnext2Old = tnext2OldOld;
            tnext3Old = tnext3OldOld;
            tnext4Old = tnext4OldOld;
            tnext5Old = tnext5OldOld;

            tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
            tnext3 = vshlq_n_s16(tnext3, 1);

            tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
            tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
            tnext2 = vsubq_s16(tc, tnext);

            tnext1 = vaddq_s16(tnext3, tnext2);
            // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4

            tnext2 = vshlq_n_s16(tnext2, 1);
            // tnext2 = 2*x4 - 4*x2 + 2*x0

            tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
            // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4

            tnext1OldOld = tnext1;
            tnext2OldOld = tnext2;
            tnext3OldOld = tnext3;
            tnext4OldOld = tnext2;
            tnext5OldOld = tnext1;

            if(x) {
                tnext1 = vextq_s16(tnext1Old, tnext1, 2);
                tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
                tprev1 = tnext3Old;

                // for the first output block (x == 8) the left-hand vectors
                // prepared by the border code below are used instead
                if(x!=8) {
                    tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
                    tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
                }
            }

            if(!x) {
                // make border: tpprev1 lane 0 is the B sum at column -1,
                // tprev1 lanes 0 and 1 are the A sums at columns -2 and -1
                if (border == BORDER_MODE_REPLICATE) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT101) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
                } else if (border == BORDER_MODE_CONSTANT) {
                    // A column made only of border pixels has
                    //   B = 2*b - 4*b + 2*b = 0   and   A = b + 2*b + 2*b + 2*b + b = 8*b,
                    // the same values the scalar loop below uses (prevx = 0,
                    // pprevx = 8 * borderValue).
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(0, tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16((s16)(8 * borderValue), tprev1, 0);
                    tprev1 = vsetq_lane_s16((s16)(8 * borderValue), tprev1, 1);
                }
                tppprev1 = tprev1;
                continue;
            }

            t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
            // final scale by 2
            t0 = vaddq_s16(t0, t0);
            vst1q_s16(drow + x - 8, t0);
        }

        // Continue in scalar code from the first unfinished column; when the
        // vector loop reached the end of the row the last two columns are
        // redone, because their right neighbours were read from beyond the row.
        x -= 8;
        if(x >= cols - 1)
            x = cols-2;

        s16 pprevx = 0;
        s16 prevx = 0;
        s16 nextx = 0;
        s16 nnextx = 0;

        for( ; x < cols; x++ )
        {
            if (x == 0) {
                // make border
                if (border == BORDER_MODE_REPLICATE) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
                    prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;
                    prevx = 0;
                }
            } else if (x == 1) {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;
                }
                prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
            } else {
                pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
            }
            s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
            if (x == cols-1) {
                // make border
                if (border == BORDER_MODE_REPLICATE) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_REFLECT) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
                    nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nextx = 0;
                    nnextx = 8 * borderValue;
                }
            } else if (x == cols-2) {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nnextx = 8 * borderValue;
                }
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
            } else {
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
                nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
            }
            s16 res = pprevx + prevx + currx + nextx + nnextx;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

160
3rdparty/carotene/src/magnitude.cpp vendored Normal file
View File

@ -0,0 +1,160 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
struct Magnitude
{
typedef s16 type;
void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
int16x8_t & v_dst) const
{
int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
v_src0_p = vget_high_s16(v_src0);
v_src1_p = vget_high_s16(v_src1);
float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
}
void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
int16x4_t & v_dst) const
{
float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
v_dst = vqmovn_s32(v_sqrt);
}
void operator() (const short * src0, const short * src1, short * dst) const
{
f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
}
};
struct MagnitudeF32
{
typedef f32 type;
void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
float32x4_t & v_dst) const
{
v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
}
void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
float32x2_t & v_dst) const
{
v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
}
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
}
};
} // namespace
#endif
// Per-element magnitude of two s16 images: hands both sources and the
// destination to internal::vtransform together with the Magnitude functor,
// which computes sqrt(src0^2 + src1^2) saturated to s16.
//
// size                    image dimensions, passed through to vtransform.
// src0Base, src0Stride    first source image.
// src1Base, src1Stride    second source image.
// dstBase, dstStride      destination image.
//
// Without CAROTENE_NEON nothing is computed; the parameters are only marked
// as used to silence compiler warnings.
void magnitude(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
Magnitude());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
// Per-element magnitude of two f32 images: hands both sources and the
// destination to internal::vtransform together with the MagnitudeF32
// functor, which computes sqrt(src0^2 + src1^2).
//
// size                    image dimensions, passed through to vtransform.
// src0Base, src0Stride    first source image.
// src1Base, src1Stride    second source image.
// dstBase, dstStride      destination image.
//
// Without CAROTENE_NEON nothing is computed; the parameters are only marked
// as used to silence compiler warnings.
void magnitude(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
MagnitudeF32());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

163
3rdparty/carotene/src/meanstddev.cpp vendored Normal file
View File

@ -0,0 +1,163 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
// Mean and standard deviation of a u8 image.
//
// The sum and the sum of squares are obtained from sqsum(); the statistics
// are then derived in f64 and narrowed to f32 on output. Either output
// pointer may be NULL, in which case that value is not written.
// Without CAROTENE_NEON nothing is computed.
void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f64 total = 0.0f;
    f64 totalSq = 0.0f;
    // NOTE(review): the trailing argument 1 is presumably the channel count - confirm against sqsum().
    sqsum(size, srcBase, srcStride, &total, &totalSq, 1);

    // calc mean and stddev
    const f64 invCount = 1.0 / size.total();
    const f64 mean = total * invCount;
    // clamp at zero: rounding can make the variance slightly negative
    const f64 variance = std::max(totalSq * invCount - mean * mean, 0.0);
    const f64 stddev = sqrt(variance);

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
// Mean and standard deviation of a u16 image.
//
// Each row is consumed in blocks of at most 1024 pixels: within a block the
// sum is accumulated in u32 lanes and the sum of squares in f32 lanes, and
// after every block both are folded into f64 totals. Pixels beyond the last
// multiple of four in a row are handled in scalar code. Either output
// pointer may be NULL, in which case that value is not written.
// Without CAROTENE_NEON nothing is computed.
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t maxBlockLen = 1 << 10;
    const size_t vecWidth = size.width & ~3;   // row width rounded down to a multiple of 4
    f64 total = 0.0f, totalSq = 0.0f;
    f32 lanes[8];
    const uint32x4_t v_zeroU32 = vdupq_n_u32(0u);
    const float32x4_t v_zeroF32 = vdupq_n_f32(0.0f);

    for (size_t row = 0; row < size.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0u;

        while (col < vecWidth)
        {
            const size_t blockEnd = std::min(vecWidth - col, maxBlockLen) + col;
            uint32x4_t v_acc = v_zeroU32;
            float32x4_t v_accSq = v_zeroF32;

            // 16 pixels per iteration
            while (col + 16 < blockEnd)
            {
                internal::prefetch(line + col);
                const uint16x8_t v_first = vld1q_u16(line + col);
                const uint16x8_t v_second = vld1q_u16(line + col + 8);

                // first 8 pixels
                uint32x4_t v_lo = vmovl_u16(vget_low_u16(v_first));
                uint32x4_t v_hi = vmovl_u16(vget_high_u16(v_first));
                v_acc = vaddq_u32(v_acc, vaddq_u32(v_lo, v_hi));
                float32x4_t v_lo_f = vcvtq_f32_u32(v_lo);
                float32x4_t v_hi_f = vcvtq_f32_u32(v_hi);
                v_accSq = vmlaq_f32(v_accSq, v_lo_f, v_lo_f);
                v_accSq = vmlaq_f32(v_accSq, v_hi_f, v_hi_f);

                // second 8 pixels
                v_lo = vmovl_u16(vget_low_u16(v_second));
                v_hi = vmovl_u16(vget_high_u16(v_second));
                v_acc = vaddq_u32(v_acc, vaddq_u32(v_lo, v_hi));
                v_lo_f = vcvtq_f32_u32(v_lo);
                v_hi_f = vcvtq_f32_u32(v_hi);
                v_accSq = vmlaq_f32(v_accSq, v_lo_f, v_lo_f);
                v_accSq = vmlaq_f32(v_accSq, v_hi_f, v_hi_f);

                col += 16;
            }

            // 4 pixels per iteration for the rest of the block
            while (col < blockEnd)
            {
                const uint32x4_t v_px = vmovl_u16(vld1_u16(line + col));
                const float32x4_t v_px_f = vcvtq_f32_u32(v_px);
                v_acc = vaddq_u32(v_acc, v_px);
                v_accSq = vmlaq_f32(v_accSq, v_px_f, v_px_f);

                col += 4;
            }

            // fold the block's lane accumulators into the f64 totals
            vst1q_f32(lanes, vcvtq_f32_u32(v_acc));
            vst1q_f32(lanes + 4, v_accSq);

            total += (f64)lanes[0] + lanes[1] + lanes[2] + lanes[3];
            totalSq += (f64)lanes[4] + lanes[5] + lanes[6] + lanes[7];
        }

        // collect a few last elements in the current row
        while (col < size.width)
        {
            const f32 px = line[col];
            total += px;
            totalSq += px * px;
            ++col;
        }
    }

    // calc mean and stddev
    const f64 invCount = 1.0 / size.total();
    const f64 mean = total * invCount;
    // clamp at zero: rounding can make the variance slightly negative
    const f64 variance = std::max(totalSq * invCount - mean * mean, 0.0);
    const f64 stddev = sqrt(variance);

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
} // namespace CAROTENE_NS

227
3rdparty/carotene/src/median_filter.cpp vendored Normal file
View File

@ -0,0 +1,227 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Returns the vector that would be loaded `cn` bytes to the left of `r` if the
// first `cn` bytes of `r` (the first pixel) were replicated in front of it.
// Callers guarantee cn <= 8, which is what the 8 spare bytes of scratch cover.
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 scratch[16 + 8];
    u8 * const body = scratch + cn;

    // lay the vector down shifted right by one pixel ...
    vst1q_u8(body, r);

    // ... and duplicate its first pixel into the gap on the left
    for (u32 c = 0; c != cn; ++c)
        scratch[c] = body[c];

    return vld1q_u8(scratch);
}
// Returns the vector that would be loaded `cn` bytes to the right of `r` if the
// last `cn` bytes of `r` (the last pixel) were replicated behind it.
// Callers guarantee cn <= 8, which is what the 8 spare bytes of scratch cover.
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 scratch[8 + 8];
    vst1_u8(scratch, r);

    // append a copy of the last pixel right after the stored vector
    const u8 * const lastPixel = scratch + 8 - cn;
    u8 * const pad = scratch + 8;
    for (u32 c = 0; c != cn; ++c)
        pad[c] = lastPixel[c];

    return vld1_u8(scratch + cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
// ELT(num, level) pastes its arguments into the identifier v<num>_lv<level>,
// e.g. ELT(4, 19) is v4_lv19. <num> is the position (0..8) inside the 3x3
// window and <level> numbers the step of the network that produced the value.
#define ELT(num, level) v ## num ## _lv ## level
// One compare-exchange step: expands to PIX_MIN followed by PIX_MAX with the
// same arguments. PIX_MIN / PIX_MAX are not defined here; each user defines
// them right before expanding SORT9 (and #undef's them afterwards) so that the
// same network can be instantiated for different vector widths.
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
// The 19-step min/max network drawn in the diagram above, applied to the nine
// inputs v0_lv00 .. v8_lv00. Every step introduces new names with a fresh
// level suffix instead of overwriting its inputs. The users in this file store
// v4_lv19 (the output of the last step) as the median.
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
// Tells whether medianFilter3x3 can process an image of the given size and
// channel count.
//
// Requirements:
//  - the platform configuration is supported (isSupportedConfiguration());
//  - 1 <= numChannels <= 8: the replicate helpers only reserve 8 spare bytes,
//    and with zero channels a row would be 0 bytes long, which makes the
//    unsigned index arithmetic in medianFilter3x3 wrap around;
//  - size.width >= 16 + numChannels, so that the first 16-byte block together
//    with its right-hand neighbour pixel fits inside a row.
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
    return isSupportedConfiguration() &&
           numChannels >= 1 && numChannels <= 8 &&
           size.width >= 16 + numChannels;
}
// 3x3 median filter for interleaved 8-bit images with up to 8 channels.
//
//  size         - image size in pixels
//  numChannels  - number of interleaved channels (cn); a row is width*cn bytes
//  srcBase,
//  srcStride    - source image and its row pitch in bytes
//  srcMargin    - how many valid pixels exist around the source region; where a
//                 margin is 0 the border pixel/row is replicated instead
//  dstBase,
//  dstStride    - destination image and its row pitch in bytes
//
// Every row is processed in two passes that both use the SORT9 network:
//  1. a forward pass over 16-byte blocks starting at byte 0 (the first block is
//     special-cased for the left border and entered through a goto);
//  2. a backward pass over 8-byte blocks starting at the last 8 bytes of the
//     row (the first block is special-cased for the right border) and running
//     down until it overlaps the area written by the forward pass.
void medianFilter3x3(const Size2D &size, u32 numChannels,
                     const u8 *srcBase, ptrdiff_t srcStride,
                     const Margin &srcMargin,
                     u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
    u32 cn = numChannels;
    size_t colsn = size.width * cn;

    for (size_t i = 0; i < size.height; ++i) {
        const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
        // without a top/bottom margin the missing neighbour row is the row itself
        const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
        const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
        u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        {
            // first block of the row: the left neighbours come either from the
            // margin or from replicating the first pixel
            uint8x16_t v3_lv00 = vld1q_u8(psrc0);
            uint8x16_t v4_lv00 = vld1q_u8(psrc1);
            uint8x16_t v5_lv00 = vld1q_u8(psrc2);
            uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
            uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
            uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
            uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
            uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
            uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);

            goto medianBlur3x3_mainBody;

            // Only take blocks whose right-neighbour load (16 bytes starting at
            // j + cn) lies completely inside the row. The previous bound
            // `j < colsn - 16` let that load run up to cn - 1 bytes past the
            // end of the row. The blocks no longer taken here are produced by
            // the backward pass below, which handles the right border properly.
            for (; j + 16 + cn <= colsn; j += 16) {
                internal::prefetch(psrc0 + j);
                internal::prefetch(psrc1 + j);
                internal::prefetch(psrc2 + j);

                v0_lv00 = vld1q_u8(psrc0 + j - cn);
                v1_lv00 = vld1q_u8(psrc1 + j - cn);
                v2_lv00 = vld1q_u8(psrc2 + j - cn);
                v3_lv00 = vld1q_u8(psrc0 + j);
                v4_lv00 = vld1q_u8(psrc1 + j);
                v5_lv00 = vld1q_u8(psrc2 + j);
                v6_lv00 = vld1q_u8(psrc0 + j + cn);
                v7_lv00 = vld1q_u8(psrc1 + j + cn);
                v8_lv00 = vld1q_u8(psrc2 + j + cn);

medianBlur3x3_mainBody:

#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                vst1q_u8(pdst + j, v4_lv19);
            }
        }

        {
            // last 8 bytes of the row: the right neighbours come either from
            // the margin or from replicating the last pixel
            size_t k = colsn - 8;
            uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
            uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
            uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
            uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
            uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
            uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
            uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
            uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
            uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);

            goto medianBlur3x3_tailBody;

            // walk backwards until the block starts below j, i.e. until it
            // overlaps what the forward pass has written; j >= 16 here, so
            // neither j - 8 nor k - 8 can wrap around
            for (; k >= j - 8; k -= 8) {
                v0_lv00 = vld1_u8(psrc0 + k - cn);
                v1_lv00 = vld1_u8(psrc1 + k - cn);
                v2_lv00 = vld1_u8(psrc2 + k - cn);
                v3_lv00 = vld1_u8(psrc0 + k);
                v4_lv00 = vld1_u8(psrc1 + k);
                v5_lv00 = vld1_u8(psrc2 + k);
                v6_lv00 = vld1_u8(psrc0 + k + cn);
                v7_lv00 = vld1_u8(psrc1 + k + cn);
                v8_lv00 = vld1_u8(psrc2 + k + cn);

medianBlur3x3_tailBody:

#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                vst1_u8(pdst + k, v4_lv19);
            }
        }
    }
#else
    (void)size;
    (void)numChannels;
    (void)srcBase;
    (void)srcStride;
    (void)srcMargin;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

139
3rdparty/carotene/src/min_max.cpp vendored Normal file
View File

@ -0,0 +1,139 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct Min
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
template <typename T>
struct Max
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
// NEON build: IMPL_OP(fun, op, type) defines the public function
//   void fun(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride)
// for element type `type`. It asserts that the configuration is supported and
// forwards all arguments to internal::vtransform with an op<type> functor.
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
// Non-NEON build: same signature with unnamed parameters; the body only calls
// internal::assertSupportedConfiguration() and does no processing. `op` is
// unused in this variant.
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// Defines both min() and max() overloads for one element type.
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
// Instantiate min()/max() for every supported element type.
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS

1340
3rdparty/carotene/src/minmaxloc.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

728
3rdparty/carotene/src/morph.cpp vendored Normal file
View File

@ -0,0 +1,728 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
// Tells whether erode3x3 / dilate3x3 can handle the given image size and
// border mode: the configuration must be supported, the image must be at least
// 16 pixels wide, and only constant and replicate borders are implemented.
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;

    if (size.width < 16)
        return false;

    return border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE;
}
#ifdef CAROTENE_NEON
namespace {
// Erosion primitive: element-wise minimum for 128-bit vectors, 64-bit vectors
// and single bytes, plus the value to use for pixels outside the image.
struct ErodeVecOp
{
    ErodeVecOp() : borderValue(0) {}

    // With a replicate border the stored value is forced to the largest u8,
    // which is the neutral element of min; otherwise the caller's value is kept.
    ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::max()
                                                    : borderValue_)
    {
    }

    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const { return vminq_u8(a, b); }

    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const { return vmin_u8(a, b); }

    inline u8 operator()(u8 a, u8 b) const { return (b < a) ? b : a; }

    u8 borderValue;
};
// Dilation primitive: element-wise maximum for 128-bit vectors, 64-bit vectors
// and single bytes, plus the value to use for pixels outside the image.
struct DilateVecOp
{
    DilateVecOp() : borderValue(0) {}

    // With a replicate border the stored value is forced to the smallest u8,
    // which is the neutral element of max; otherwise the caller's value is kept.
    DilateVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::min()
                                                    : borderValue_)
    {
    }

    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const { return vmaxq_u8(a, b); }

    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const { return vmax_u8(a, b); }

    inline u8 operator()(u8 a, u8 b) const { return (a < b) ? b : a; }

    u8 borderValue;
};
// Generic 3x3 morphology on a single-channel u8 image.
//
// `vop` supplies the reduction (overloads for uint8x16_t, uint8x8_t and u8)
// and the value used for out-of-image pixels (vop.borderValue).
//
// For every row the three source rows above/at/below are first reduced
// column-wise into 16-byte vectors; three consecutive such vectors
// (tprev, tcurr, tnext) are then combined with byte shifts to obtain the
// left/centre/right reduction. Because the right neighbour vector is needed,
// the store for a block happens one iteration after its load
// (drow + x - 16). Whatever the vector loop does not cover is finished by a
// scalar loop that keeps the same three-value window in prevx/currx/nextx.
template <typename VecOp>
void morph3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, const VecOp & vop)
{
u8 borderValue = vop.borderValue;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
const uint8x16_t v_zero = vdupq_n_u8(0);
const uint8x16_t v_border = vdupq_n_u8(borderValue);
uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
for (ptrdiff_t y = 0; y < height; ++y)
{
// srow0 / srow2 are NULL when the neighbour row lies outside the image and
// the border is constant; otherwise the row index is clamped into the image
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
u8 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
// the vector loop may load 16 bytes starting at x == bwidth; for the last two
// rows the bound is lowered by 16
// NOTE(review): presumably so that these loads stay inside the image buffer - confirm
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
// reduce the three rows column-wise, 16 columns at a time
for ( ; x <= bwidth; x += 16)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
uint8x16_t x1 = vld1q_u8(srow1 + x);
uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
// calculate values for plain CPU part below if needed
// (prevx / currx are the column reductions at x3 - 1 and x3; they are
// recomputed on every iteration close to the end, the last one wins)
if (x + 16 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = vop(srow1[x4],
vop(srow2 ? srow2[x4] : borderValue,
srow0 ? srow0[x4] : borderValue));
currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vop(vop(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
// nothing can be stored yet: the first block still lacks its right neighbour
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u8(tprev, tcurr, 15);
t1 = tcurr;
t2 = vextq_u8(tcurr, tnext, 1);
// and reduce them with vop
t0 = vop(t0, vop(t1, t2));
vst1q_u8(drow + x - 16, t0);
}
// step back to the last block that was loaded; its columns (or, if it started
// exactly at width, the last column) are produced by the scalar loop
x -= 16;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue;
else if (border == BORDER_MODE_REPLICATE)
nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
}
else
nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
srow0 ? srow0[x + 1] : borderValue),
srow1[x + 1]);
drow[x] = vop(prevx, vop(currx, nextx));
// make shift
prevx = currx;
currx = nextx;
}
}
}
} // namespace
#endif
// 3x3 erosion (minimum filter) of a single-channel u8 image.
// `border` selects constant or replicate border handling; `borderValue` is the
// constant used with BORDER_MODE_CONSTANT.
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    const bool supported = isMorph3x3Supported(size, border);
    internal::assertSupportedConfiguration(supported);

#ifdef CAROTENE_NEON
    const ErodeVecOp minOp(border, borderValue);
    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, minOp);
#else
    // nothing to do without NEON; just silence unused-parameter warnings
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// 3x3 dilation (maximum filter) of a single-channel u8 image.
// `border` selects constant or replicate border handling; `borderValue` is the
// constant used with BORDER_MODE_CONSTANT.
void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    const bool supported = isMorph3x3Supported(size, border);
    internal::assertSupportedConfiguration(supported);

#ifdef CAROTENE_NEON
    const DilateVecOp maxOp(border, borderValue);
    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, maxOp);
#else
    // nothing to do without NEON; just silence unused-parameter warnings
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Horizontal pass of a separable morphological filter.
//
// For each of the `width` output pixels (with `cn` interleaved channels each)
// reduces `ksize` horizontally consecutive source pixels with VecUpdate and
// writes the result to dst.
//
//  src    - border-extended source row; (width + ksize - 1) * cn bytes are read
//  dst    - output row; width * cn bytes are written
//  width  - number of output pixels
//  cn     - number of interleaved channels
//  ksize  - kernel width in pixels; ksize == 1 degenerates to a plain copy
//
// VecUpdate must be default-constructible and provide operator() for
// uint8x16_t, uint8x8_t and u8.
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    // byte counts covered by the 16- and 8-wide vector loops
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    // from here on width and ksize are measured in bytes, not pixels
    width *= cn;

    if (ksize == 1)
    {
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }

    ksize = ksize*cn;
    VecUpdate updateOp;

    switch(cn)
    {
    case 1:
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));
            vst1_u8(dst + i, s);
        }
        break;
    default:
        // multi-channel: neighbouring pixels of the same channel are cn bytes apart
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));
            vst1_u8(dst + i, s);
        }
        break;
    }

    // scalar remainder, one channel at a time (src/dst are advanced per channel)
    ptrdiff_t i0 = i;
    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
    {
        // Two neighbouring output pixels share the reduction of their common
        // kernel_width - 1 source pixels. The bound is written as an addition:
        // `width - cn*2` is evaluated in size_t and wraps around to a huge
        // value when the row is narrower than two pixels, which made this loop
        // run far out of bounds for 1-pixel-wide images.
        for( i = i0; i + cn*2 <= width; i += cn*2 )
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for( j = cn*2; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            // j == ksize here: the first pixel beyond the kernel of dst[i]
            dst[i+cn] = updateOp(m, s[j]);
        }

        // at most one pixel left
        for( ; i < width; i += cn )
        {
            const u8* s = src + i;
            u8 m = s[0];
            for( j = cn; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
// Vertical pass of a separable morphological filter.
//
// src   - table of row pointers; output row r is the element-wise reduction
//         (VecUpdate) of src[r] .. src[r + ksize - 1]
// dst,
// dststep - first output row and distance between output rows in bytes
// count - number of output rows to produce
// width - number of bytes per row
// ksize - kernel height in rows
//
// While at least two output rows remain (and ksize > 1) they are produced in
// pairs: the reduction of src[1] .. src[ksize - 1] is shared, the first row
// additionally takes src[0] and the second one src[ksize]. ksize == 3 has a
// hand-unrolled copy of that path. Remaining rows (and every row when
// ksize == 1) go through the one-row-at-a-time loop at the end.
// Each path handles 32 bytes per iteration with NEON and the rest with scalars.
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
size_t i, k;
size_t width32 = width & -32;
VecUpdate updateOp;
uint8x16_t x0,x1,s0,s1;
if (ksize == 3)
{
// unrolled two-rows-at-a-time path for a kernel height of 3
for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
// shared part: src[1] combined with src[2]
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
sptr = src[2] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
// first output row: add src[0]
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
// second output row: add src[3]
sptr = src[3] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
// k == ksize after the loop above
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
}
else if (ksize > 1)
// generic two-rows-at-a-time path
for (; count > 1; count -= 2, dst += dststep*2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 2; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
// k == ksize here: the extra row needed by the second output row
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
// whatever is left: one output row per iteration
for (; count > 0; count--, dst += dststep, src++)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[0] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 1; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
vst1q_u8(dst + i, s0);
vst1q_u8(dst + i + 16, s1);
}
for(; i < width; i++ )
{
u8 s = src[0][i];
for( k = 1; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = s;
}
}
}
// Separable morphological filter (rectangular ksize.width x ksize.height
// kernel) built from MorphRow<Op> and MorphColumn<Op>.
//
//  ssize, cn        - size of the processed region in pixels and channel count
//  srcBase/Stride   - source region and row pitch in bytes
//  dstBase/Stride   - destination region and row pitch in bytes
//  ksize            - kernel size
//  anchorX/anchorY  - anchor position inside the kernel
//  rowBorderType    - how pixels left/right of the available data are produced
//  columnBorderType - how rows above/below the available data are produced
//  borderValues     - cn values used for BORDER_MODE_CONSTANT
//  borderMargin     - number of valid source pixels around the region that may
//                     be read instead of synthesising a border
//
// Each source row is copied into srcRow with its horizontal border added,
// filtered horizontally into a slot of a ring buffer, and groups of ring
// buffer rows are then filtered vertically straight into dst.
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
//Temporary buffers common for all iterations
// srcRow: one source row extended by ksize.width - 1 border pixels
std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
u8* srcRow = &_srcRow[0];
// rows: pointer table handed to MorphColumn; bufRows: capacity of the ring buffer
size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
std::vector<u8*> _rows(bufRows);
u8** rows = &_rows[0];
// adjust swidthcn so that the used part of buffers stays compact in memory
ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
// 16 extra bytes leave room for aligning the start of the buffer
std::vector<u8> _ringBuf(swidthcn*bufRows+16);
u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
// borderTab: for non-constant row borders, the source byte offset to copy
// each border byte from (left border entries first, then right border)
size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
std::vector<ptrdiff_t> _borderTab(borderLength);
ptrdiff_t * borderTab = &_borderTab[0];
std::vector<u8> _constBorderValue;
std::vector<u8> _constBorderRow;
u8 * constBorderValue = NULL;
u8 * constBorderRow = NULL;
if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
{
// constBorderValue: the cn border values repeated to borderLength bytes
_constBorderValue.resize(borderLength);
constBorderValue = &_constBorderValue[0];
size_t i;
for(i = 0; i < cn; i++)
constBorderValue[i] = borderValues[i];
for(; i < borderLength; i++)
constBorderValue[i] = constBorderValue[i-cn];
if( columnBorderType == BORDER_MODE_CONSTANT )
{
// constBorderRow: horizontally filtered row made of border pixels only;
// it stands in for every row outside the image in the vertical pass
_constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
size_t N = (ssize.width + ksize.width - 1)*cn;
for( i = 0; i < N; i += borderLength )
{
size_t n = std::min( borderLength, N - i );
for(size_t j = 0; j < n; j++)
srcRow[i+j] = constBorderValue[j];
}
MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
}
}
Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
ssize.height + borderMargin.top + borderMargin.bottom);
// dx1 / dx2: number of border pixels that have to be synthesised on the left /
// right because the margin does not provide enough real pixels
ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
// recompute border tables
if( dx1 > 0 || dx2 > 0 )
{
if( rowBorderType == BORDER_MODE_CONSTANT )
{
// the constant border never changes, so write it into srcRow once
memcpy( srcRow, &constBorderValue[0], dx1*cn );
memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
}
else
{
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
ptrdiff_t wholeWidth = wholeSize.width;
ptrdiff_t i, j;
for( i = 0; i < dx1; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[i*cn + j] = p0 + j;
}
for( i = 0; i < dx2; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[(i + dx1)*cn + j] = p0 + j;
}
}
}
// [startY, endY): range of source rows (in whole-image coordinates, margin
// included) that will be read
ptrdiff_t startY, startY0, endY, rowCount;
startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
u8* dst = dstBase;
ptrdiff_t width = ssize.width, kwidth = ksize.width;
ptrdiff_t kheight = ksize.height, ay = anchorY;
ptrdiff_t width1 = ssize.width + kwidth - 1;
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
ptrdiff_t dy = 0, i = 0;
// move src left onto the first real pixel that is available from the margin
src -= xofs1*cn;
ptrdiff_t count = endY - startY;
rowCount = 0;
// each iteration: feed `dcount` source rows through the horizontal pass, then
// emit as many output rows (i) as the buffered rows allow
for(;; dst += dstStride*i, dy += i)
{
ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
dcount = std::min(dcount, count);
count -= dcount;
for( ; dcount-- > 0; src += srcStride )
{
// ring buffer slot for this row
ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
u8* brow = ringBuf + bi*swidthcn;
// once the ring buffer is full the oldest row is dropped
if( (size_t)(++rowCount) > bufRows )
{
--rowCount;
++startY;
}
memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
if( makeBorder )
{
for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
srcRow[i] = src[borderTab[i]];
for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
}
MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
}
// collect the row pointers needed by the next output rows
ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
for( i = 0; i < max_i; i++ )
{
ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
wholeSize.height, columnBorderType);
if( srcY < 0 ) // can happen only with constant border type
rows[i] = constBorderRow;
else
{
// this row has not been filtered horizontally yet
if( srcY >= startY + rowCount )
break;
ptrdiff_t bi = (srcY - startY0) % bufRows;
rows[i] = ringBuf + bi*swidthcn;
}
}
// fewer than kheight rows available: no further output row can be produced
if( i < kheight )
break;
// i rows allow i - (kheight - 1) output rows
i -= kheight - 1;
MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
}
}
} // namespace
#endif // CAROTENE_NEON
// Erosion (minimum filter) with a rectangular ksize kernel and arbitrary
// anchor. The region must be non-empty and the anchor must lie inside the
// kernel. Does nothing beyond the argument check when NEON is unavailable.
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    const bool validGeometry = ssize.width > 0 && ssize.height > 0 &&
                               anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(validGeometry);

#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           ksize, anchorX, anchorY,
                           rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    // silence unused-parameter warnings
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
// Dilation (maximum filter) with a rectangular ksize kernel and arbitrary
// anchor. The region must be non-empty and the anchor must lie inside the
// kernel. Does nothing beyond the argument check when NEON is unavailable.
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    const bool validGeometry = ssize.width > 0 && ssize.height > 0 &&
                               anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(validGeometry);

#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn,
                            srcBase, srcStride,
                            dstBase, dstStride,
                            ksize, anchorX, anchorY,
                            rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    // silence unused-parameter warnings
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS

1572
3rdparty/carotene/src/mul.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1310
3rdparty/carotene/src/norm.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

539
3rdparty/carotene/src/opticalflow.cpp vendored Normal file
View File

@ -0,0 +1,539 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <float.h> // For FLT_EPSILON
namespace CAROTENE_NS {
// Scales x down by 2^n with rounding: adds half of the divisor (1 << (n-1))
// before shifting right by n bits. n must be at least 1.
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
/*
* Pyramidal Lucas-Kanade Optical Flow level processing
*
* Tracks ptCount points at a single pyramid level. For every point the function
* 1. builds a winSize patch of bilinearly interpolated intensities and x/y derivatives
* from the previous image (stored in a scratch buffer),
* 2. accumulates the 2x2 matrix [A11 A12; A12 A22] of derivative products and rejects the
* point if that matrix is close to singular or its scaled minimum eigenvalue is below
* minEigThreshold,
* 3. iterates at most terminationCount times, each time sampling the next image at the
* current estimate and updating nextPts with the solved displacement,
* 4. optionally (level 0, err != NULL, !getMinEigenVals, point still valid) writes the mean
* absolute patch difference to err.
*
* Parameters, as used by the code below:
* size - image dimensions, used only for the window-origin range checks
* cn - number of channels; horizontal offsets are multiplied by it
* prevData/prevStride - previous image (u8); the stride is added to a u8 pointer, i.e. bytes
* prevDerivData/prevDerivStride - s16 derivatives of the previous image, read as interleaved
* (x, y) pairs; the stride is divided by sizeof(s16), i.e. bytes
* nextData/nextStride - next image (u8), stride in bytes
* prevPts/nextPts - interleaved (x, y) coordinates, two floats per point; nextPts is
* both read (initial estimate) and written (result)
* status - optional; set to false for a point that is dropped at level 0
* err - optional; receives minEig (getMinEigenVals) or the patch error
* terminationEpsilon - compared against the squared length of the last update step
* level/maxLevel - current pyramid level and the level at which processing started
* useInitialFlow - at level == maxLevel, start from the caller-supplied nextPts
*
* NOTE(review): the range checks accept window origins from -winSize up to size-1, so the
* patch reads can extend outside the size rectangle; presumably the caller supplies
* border-padded buffers - confirm.
*
* Without CAROTENE_NEON the function does no work after the configuration check.
*/
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
const u8 *prevData, ptrdiff_t prevStride,
const s16 *prevDerivData, ptrdiff_t prevDerivStride,
const u8 *nextData, ptrdiff_t nextStride,
u32 ptCount,
const f32 *prevPts, f32 *nextPts,
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
f32 minEigThreshold)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Offset from a point to the top-left corner of its window.
f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
s32 cn2 = cn*2;
// One scratch buffer: the interpolated intensity patch (cn values per pixel) followed by
// the interleaved x/y derivative patch (2*cn values per pixel).
std::vector<s16> _buf(winSize.total()*(cn + cn2));
s16* IWinBuf = &_buf[0];
s32 IWinBufStride = winSize.width*cn;
s16* derivIWinBuf = &_buf[winSize.total()*cn];
s32 derivIWinBufStride = winSize.width*cn2;
for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
{
// Scale factor from level-0 coordinates to this pyramid level.
f32 levscale = (1./(1 << level));
u32 ptref = ptidx << 1;
f32 prevPtX = prevPts[ptref+0]*levscale;
f32 prevPtY = prevPts[ptref+1]*levscale;
f32 nextPtX;
f32 nextPtY;
// Initial estimate: at level == maxLevel use the caller's nextPts (scaled) or the previous
// position; at any other level double the value currently stored in nextPts.
if( level == maxLevel )
{
if( useInitialFlow )
{
nextPtX = nextPts[ptref+0]*levscale;
nextPtY = nextPts[ptref+1]*levscale;
}
else
{
nextPtX = prevPtX;
nextPtY = prevPtY;
}
}
else
{
nextPtX = nextPts[ptref+0]*2.f;
nextPtY = nextPts[ptref+1]*2.f;
}
nextPts[ptref+0] = nextPtX;
nextPts[ptref+1] = nextPtY;
s32 iprevPtX, iprevPtY;
s32 inextPtX, inextPtY;
prevPtX -= halfWinX;
prevPtY -= halfWinY;
iprevPtX = floor(prevPtX);
iprevPtY = floor(prevPtY);
// Skip the point when the window origin is outside the accepted range; the point is only
// flagged as lost (and its error cleared) at level 0.
if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
continue;
}
// Fractional part of the window origin -> bilinear weights in fixed point with W_BITS
// fractional bits; iw11 is derived so that the four weights sum to exactly 1 << W_BITS.
f32 a = prevPtX - iprevPtX;
f32 b = prevPtY - iprevPtY;
const s32 W_BITS = 14, W_BITS1 = 14;
// Common scale factor applied to the accumulated sums (A11, A12, A22, b1, b2).
const f32 FLT_SCALE = 1.f/(1 << 20);
s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
// Derivative row stride in s16 elements.
s32 dstep = prevDerivStride/sizeof(s16);
// Scalar accumulators (filled by the scalar tail loop) and their vector counterparts.
f32 A11 = 0, A12 = 0, A22 = 0;
int16x4_t viw00 = vmov_n_s16((s16)iw00);
int16x4_t viw01 = vmov_n_s16((s16)iw01);
int16x4_t viw10 = vmov_n_s16((s16)iw10);
int16x4_t viw11 = vmov_n_s16((s16)iw11);
float32x4_t vA11 = vmovq_n_f32(0);
float32x4_t vA12 = vmovq_n_f32(0);
float32x4_t vA22 = vmovq_n_f32(0);
// Number of values in one window row (pixels * channels).
s32 wwcn = winSize.width*cn;
// extract the patch from the first image, compute covariation matrix of derivatives
s32 x = 0;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;
s16* Iptr = IWinBuf + y*IWinBufStride;
s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
// Note: x still holds its value from the end of the previous row here (0 on the first row).
internal::prefetch(src + x + prevStride * 2, 0);
// Intensity, 8 values per iteration: weighted sum of the four neighbours, narrowed with a
// rounding shift by W_BITS1-5, so the stored value keeps 5 extra fractional bits.
for(x = 0; x <= wwcn - 8; x += 8)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
}
// Intensity, 4 values per iteration: 8 bytes are loaded but only the low four lanes are used.
for(; x <= wwcn - 4; x += 4)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));
int32x4_t vsuml1 = vmull_s16(vs00, viw00);
int32x4_t vsuml2 = vmull_s16(vs01, viw01);
vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
vst1_s16(Iptr + x, vsumnl);
}
internal::prefetch(dsrc + dstep * 2, 0);
// Derivatives, 4 (x, y) pairs per iteration: interpolate both components, store them
// interleaved and accumulate the products Ix*Ix, Ix*Iy and Iy*Iy. x restarts at 0 and ends
// at the same position as the intensity loops above.
for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
{
// This condition can never be true, so the inline-assembly variant is compiled out and
// the intrinsics version in the #else branch is always used.
#if __GNUC_MINOR__ < 0
__asm__ (
"vld2.16 {d0-d1}, [%[dsrc00]] \n\t"
"vld2.16 {d2-d3}, [%[dsrc10]] \n\t"
"vld2.16 {d4-d5}, [%[dsrc01]] \n\t"
"vld2.16 {d6-d7}, [%[dsrc11]] \n\t"
"vmull.s16 q4, d3, %P[viw10] \n\t"
"vmull.s16 q5, d0, %P[viw00] \n\t"
"vmlal.s16 q4, d7, %P[viw11] \n\t"
"vmlal.s16 q5, d4, %P[viw01] \n\t"
"vmlal.s16 q4, d1, %P[viw00] \n\t"
"vmlal.s16 q5, d2, %P[viw10] \n\t"
"vmlal.s16 q4, d5, %P[viw01] \n\t"
"vmlal.s16 q5, d6, %P[viw11] \n\t"
"vrshrn.s32 d13, q4, %[W_BITS1] \n\t"
"vrshrn.s32 d12, q5, %[W_BITS1] \n\t"
"vmull.s16 q3, d13, d13 \n\t"
"vmull.s16 q4, d12, d12 \n\t"
"vmull.s16 q5, d13, d12 \n\t"
"vcvt.f32.s32 q3, q3 \n\t"
"vcvt.f32.s32 q4, q4 \n\t"
"vcvt.f32.s32 q5, q5 \n\t"
"vadd.f32 %q[vA22], q3 \n\t"
"vadd.f32 %q[vA11], q4 \n\t"
"vadd.f32 %q[vA12], q5 \n\t"
"vst2.16 {d12-d13}, [%[out]] \n\t"
: [vA22] "=w" (vA22),
[vA11] "=w" (vA11),
[vA12] "=w" (vA12)
: "0" (vA22),
"1" (vA11),
"2" (vA12),
[out] "r" (dIptr),
[dsrc00] "r" (dsrc),
[dsrc10] "r" (dsrc + dstep),
[dsrc01] "r" (dsrc + cn2),
[dsrc11] "r" (dsrc + dstep + cn2),
[viw00] "w" (viw00),
[viw10] "w" (viw10),
[viw01] "w" (viw01),
[viw11] "w" (viw11),
[W_BITS1] "I" (W_BITS1)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
);
#else
// vld2 de-interleaves: val[0] holds the x derivatives, val[1] the y derivatives.
int16x4x2_t vdsrc00 = vld2_s16(dsrc);
int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);
int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);
vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);
vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);
vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);
// Derivatives are descaled by the full W_BITS1 (no extra fractional bits).
int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);
int32x4_t va22i = vmull_s16(vsumny, vsumny);
int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
int32x4_t va12i = vmull_s16(vsumnx, vsumny);
float32x4_t va22f = vcvtq_f32_s32(va22i);
float32x4_t va11f = vcvtq_f32_s32(va11i);
float32x4_t va12f = vcvtq_f32_s32(va12i);
vA22 = vaddq_f32(vA22, va22f);
vA11 = vaddq_f32(vA11, va11f);
vA12 = vaddq_f32(vA12, va12f);
int16x4x2_t vsum;
vsum.val[0] = vsumnx;
vsum.val[1] = vsumny;
vst2_s16(dIptr, vsum);
#endif
}
// Scalar tail: the remaining (fewer than 4) values of the row, intensity and derivatives together.
for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
{
s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);
Iptr[x] = (s16)ival;
dIptr[0] = (s16)ixval;
dIptr[1] = (s16)iyval;
A11 += (f32)(ixval*ixval);
A12 += (f32)(ixval*iyval);
A22 += (f32)(iyval*iyval);
}
}
// Horizontal reduction of the vector accumulators, added to the scalar-tail sums.
f32 A11buf[2], A12buf[2], A22buf[2];
vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
A11 += A11buf[0] + A11buf[1];
A12 += A12buf[0] + A12buf[1];
A22 += A22buf[0] + A22buf[1];
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
// D is the determinant of [A11 A12; A12 A22]; minEig is the smaller eigenvalue of that
// matrix divided by the window area.
f32 D = A11*A22 - A12*A12;
f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
4.f*A12*A12))/(2*winSize.width*winSize.height);
if( err && getMinEigenVals )
err[ptidx] = (f32)minEig;
// Reject the point when minEig is below the threshold or the matrix is (nearly) singular.
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
status[ptidx] = false;
continue;
}
// From here on D holds the reciprocal of the determinant.
D = 1.f/D;
nextPtX -= halfWinX;
nextPtY -= halfWinY;
f32 prevDeltaX = 0;
f32 prevDeltaY = 0;
// Iterative refinement of the window origin in the next image.
for(u32 j = 0; j < terminationCount; j++ )
{
inextPtX = floor(nextPtX);
inextPtY = floor(nextPtY);
// Stop iterating when the window origin leaves the accepted range; flagged as lost at level 0.
if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
{
if( level == 0 && status )
status[ptidx] = false;
break;
}
// Bilinear weights for the current sub-pixel position in the next image.
a = nextPtX - inextPtX;
b = nextPtY - inextPtY;
iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = round(a*(1.f - b)*(1 << W_BITS));
iw10 = round((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
// b1 / b2 accumulate (J - I) * Ix and (J - I) * Iy over the window, where J is the
// interpolated next-image patch and I, Ix, Iy come from the scratch buffer.
f32 b1 = 0, b2 = 0;
viw00 = vmov_n_s16((s16)iw00);
viw01 = vmov_n_s16((s16)iw01);
viw10 = vmov_n_s16((s16)iw10);
viw11 = vmov_n_s16((s16)iw11);
float32x4_t vb1 = vmovq_n_f32(0);
float32x4_t vb2 = vmovq_n_f32(0);
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
x = 0;
internal::prefetch(Jptr, nextStride * 2);
internal::prefetch(Iptr, IWinBufStride/2);
internal::prefetch(dIptr, derivIWinBufStride/2);
// 8 values per iteration; there is no 4-wide step here, the scalar loop below handles the rest.
for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
{
uint8x8_t vj00 = vld1_u8(Jptr + x);
uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
int16x8_t vI = vld1q_s16(Iptr + x);
int16x8x2_t vDerivI = vld2q_s16(dIptr);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
// Saturating 16-bit subtraction J - I.
int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);
int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));
float32x4_t vb1f = vcvtq_f32_s32(vb1i);
float32x4_t vb2f = vcvtq_f32_s32(vb2i);
vb1 = vaddq_f32(vb1, vb1f);
vb2 = vaddq_f32(vb2, vb2f);
}
for( ; x < wwcn; x++, dIptr += 2 )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (f32)(diff*dIptr[0]);
b2 += (f32)(diff*dIptr[1]);
}
}
// Reduce the vector sums: after the pairwise add, lane 0 is the total of vb1 and lane 1 the total of vb2.
f32 bbuf[2];
float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
vst1_f32(bbuf, vb);
b1 += bbuf[0];
b2 += bbuf[1];
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
// Displacement update from the 2x2 system (D already holds 1/determinant).
f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
f32 deltaY = (f32)((A12*b1 - A11*b2) * D);
nextPtX += deltaX;
nextPtY += deltaY;
nextPts[ptref+0] = nextPtX + halfWinX;
nextPts[ptref+1] = nextPtY + halfWinY;
// Converged: the squared step length is at or below terminationEpsilon.
if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
break;
// Oscillation: this step nearly cancels the previous one, so settle half a step back.
if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
std::abs(deltaY + prevDeltaY) < 0.01 )
{
nextPts[ptref+0] -= deltaX*0.5f;
nextPts[ptref+1] -= deltaY*0.5f;
break;
}
prevDeltaX = deltaX;
prevDeltaY = deltaY;
}
// Patch error at level 0 for points still marked valid: mean absolute difference between
// the next-image patch at the final position and the stored previous patch.
if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
{
f32 nextPointX = nextPts[ptref+0] - halfWinX;
f32 nextPointY = nextPts[ptref+1] - halfWinY;
s32 inextPointX = floor(nextPointX);
s32 inextPointY = floor(nextPointY);
if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
{
// status is already known to be non-null in this branch; the check is redundant.
if( status )
status[ptidx] = false;
continue;
}
f32 aa = nextPointX - inextPointX;
f32 bb = nextPointY - inextPointY;
iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
iw10 = round((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 errval = 0.f;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
for( x = 0; x < wwcn; x++ )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((f32)diff);
}
}
// The factor 32 removes the 5 extra fractional bits carried by the intensity values.
err[ptidx] = errval / (32*wwcn*winSize.height);
}
}
#else
// No NEON support: nothing is computed; the casts only silence unused-parameter warnings.
(void)size;
(void)cn;
(void)prevData;
(void)prevStride;
(void)prevDerivData;
(void)prevDerivStride;
(void)nextData;
(void)nextStride;
(void)prevPts;
(void)nextPts;
(void)status;
(void)err;
(void)winSize;
(void)terminationCount;
(void)terminationEpsilon;
(void)level;
(void)maxLevel;
(void)useInitialFlow;
(void)getMinEigenVals;
(void)minEigThreshold;
(void)ptCount;
#endif
}
}//CAROTENE_NS

274
3rdparty/carotene/src/phase.cpp vendored Normal file
View File

@ -0,0 +1,274 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cfloat>
#include <cmath>
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// FASTATAN2CONST(scale)
// Declares, in the enclosing scope, the constants used by the two macros below:
// - P1, P3, P5, P7: odd-power polynomial coefficients multiplied by (180 / pi) * scale,
// - A_90, A_180, A_360: 90, 180 and 360 multiplied by scale,
// - eps, _90, _180, _360, z, p1..p7: the same values (plus zero and DBL_EPSILON as float)
// broadcast into float32x4_t vectors.
// `scale` is pasted unparenthesised into each product, so an expression argument such as
// `256.0f / 360.0f` is evaluated left to right together with the surrounding factors.
#define FASTATAN2CONST(scale) \
f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \
P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \
P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \
P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
A_90((f32)(90.f * scale)), \
A_180((f32)(180.f * scale)), \
A_360((f32)(360.f * scale)); \
float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
_90(vdupq_n_f32(A_90)), \
_180(vdupq_n_f32(A_180)), \
_360(vdupq_n_f32(A_360)), \
z(vdupq_n_f32(0.0f)), \
p1(vdupq_n_f32(P1)), \
p3(vdupq_n_f32(P3)), \
p5(vdupq_n_f32(P5)), \
p7(vdupq_n_f32(P7));
// FASTATAN2SCALAR(y, x, a)
// Scalar polynomial approximation of the angle of (x, y), written to `a` in the units fixed
// by FASTATAN2CONST (degrees * scale). The polynomial is evaluated on min(|x|,|y|) / max(|x|,|y|)
// (with DBL_EPSILON added to the denominator to avoid division by zero), giving an angle in
// the first half-quadrant; the result is then reflected using A_90, A_180 and A_360 according
// to which magnitude is larger and to the signs of x and y.
// Requires FASTATAN2CONST in scope. `x` and `y` are expanded several times, and the block
// declares locals ax, ay, c, c2, so arguments must not use those names or have side effects.
#define FASTATAN2SCALAR(y, x, a) \
{ \
f32 ax = std::abs(x), ay = std::abs(y); \
f32 c, c2; \
if (ax >= ay) \
{ \
c = ay / (ax + (float)DBL_EPSILON); \
c2 = c * c; \
a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
else \
{ \
c = ax / (ay + (float)DBL_EPSILON); \
c2 = c * c; \
a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
if (x < 0) \
a = A_180 - a; \
if (y < 0) \
a = A_360 - a; \
}
// FASTATAN2VECTOR(v_y, v_x, a)
// Four-lane counterpart of FASTATAN2SCALAR: the same polynomial on min/max of the absolute
// values, with the three reflections done through lane-wise selects (vbslq_f32) instead of
// branches. The division is replaced by a multiplication with internal::vrecpq_f32(tmax + eps)
// - presumably a reciprocal helper defined elsewhere; its accuracy is not visible here.
// Requires FASTATAN2CONST in scope; declares locals ax, ay, tmin, tmax, c, c2.
#define FASTATAN2VECTOR(v_y, v_x, a) \
{ \
float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
float32x4_t c2 = vmulq_f32(c, c); \
a = vmulq_f32(c2, p7); \
\
a = vmulq_f32(vaddq_f32(a, p5), c2); \
a = vmulq_f32(vaddq_f32(a, p3), c2); \
a = vmulq_f32(vaddq_f32(a, p1), c); \
\
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
\
}
} // namespace
#endif
/*
 * Per-element angle of the vector (src0, src1) for s16 inputs, written as u8.
 *
 * src0 supplies the x component and src1 the y component passed to the
 * FASTATAN2* macros. Angles are expressed so that 360 degrees corresponds to
 * 256; each result is rounded by adding 0.5 and truncating, then narrowed to
 * 8 bits (the narrowing keeps only the low byte).
 * Strides are handed to internal::getRowPtr together with the row index.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(256.0f / 360.0f)

    const float32x4_t vHalf = vdupq_n_f32(0.5f);
    // Exclusive bounds on the start column of a 16-wide / 8-wide vector step.
    const size_t limit16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
        const s16 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // 16 elements per step: four groups of four lanes.
        while (col < limit16)
        {
            internal::prefetch(xRow + col);
            internal::prefetch(yRow + col);

            const int16x8_t vxFirst = vld1q_s16(xRow + col);
            const int16x8_t vxSecond = vld1q_s16(xRow + col + 8);
            const int16x8_t vyFirst = vld1q_s16(yRow + col);
            const int16x8_t vySecond = vld1q_s16(yRow + col + 8);

            float32x4_t vfx, vfy, vAngLo, vAngHi;

            // elements 0..7
            vfx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxFirst)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vyFirst)));
            FASTATAN2VECTOR(vfy, vfx, vAngLo)
            vfx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxFirst)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vyFirst)));
            FASTATAN2VECTOR(vfy, vfx, vAngHi)
            const uint16x8_t vOutFirst = vcombine_u16(
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngLo, vHalf))),
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngHi, vHalf))));

            // elements 8..15
            vfx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxSecond)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vySecond)));
            FASTATAN2VECTOR(vfy, vfx, vAngLo)
            vfx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxSecond)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vySecond)));
            FASTATAN2VECTOR(vfy, vfx, vAngHi)
            const uint16x8_t vOutSecond = vcombine_u16(
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngLo, vHalf))),
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngHi, vHalf))));

            vst1q_u8(outRow + col, vcombine_u8(vmovn_u16(vOutFirst),
                                               vmovn_u16(vOutSecond)));
            col += 16;
        }

        // 8 elements per step.
        while (col < limit8)
        {
            const int16x8_t vxAll = vld1q_s16(xRow + col);
            const int16x8_t vyAll = vld1q_s16(yRow + col);

            float32x4_t vfx, vfy, vAngLo, vAngHi;

            vfx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxAll)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vyAll)));
            FASTATAN2VECTOR(vfy, vfx, vAngLo)
            vfx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxAll)));
            vfy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vyAll)));
            FASTATAN2VECTOR(vfy, vfx, vAngHi)

            const uint16x8_t vOut = vcombine_u16(
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngLo, vHalf))),
                vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngHi, vHalf))));
            vst1_u8(outRow + col, vmovn_u16(vOut));
            col += 8;
        }

        // Scalar remainder of the row.
        while (col < size.width)
        {
            f32 gx = xRow[col], gy = yRow[col];
            f32 angle;
            FASTATAN2SCALAR(gy, gx, angle)
            outRow[col] = (u8)(s32)floor(angle + 0.5f);
            col++;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
/*
 * Per-element angle of the vector (src0, src1) for f32 inputs, written as f32.
 *
 * src0 supplies the x component and src1 the y component passed to the
 * FASTATAN2* macros; `scale` multiplies the angle constants (degrees * scale).
 * Rows are processed 8 elements at a time, then at most one 4-element step,
 * then element by element.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void phase(const Size2D &size,
           const f32 * src0Base, ptrdiff_t src0Stride,
           const f32 * src1Base, ptrdiff_t src1Stride,
           f32 * dstBase, ptrdiff_t dstStride,
           f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(scale)

    // Exclusive bound on the start column of an 8-wide vector step.
    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
        f32 * outRow = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        while (col < limit8)
        {
            internal::prefetch(xRow + col);
            internal::prefetch(yRow + col);

            // All eight inputs are loaded before the first store.
            float32x4_t vxLo = vld1q_f32(xRow + col);
            float32x4_t vxHi = vld1q_f32(xRow + col + 4);
            float32x4_t vyLo = vld1q_f32(yRow + col);
            float32x4_t vyHi = vld1q_f32(yRow + col + 4);

            float32x4_t vAngle;

            FASTATAN2VECTOR(vyLo, vxLo, vAngle)
            vst1q_f32(outRow + col, vAngle);

            FASTATAN2VECTOR(vyHi, vxHi, vAngle)
            vst1q_f32(outRow + col + 4, vAngle);

            col += 8;
        }

        // At most one additional 4-element step.
        if (col + 4 <= size.width)
        {
            float32x4_t vx = vld1q_f32(xRow + col);
            float32x4_t vy = vld1q_f32(yRow + col);
            float32x4_t vAngle;
            FASTATAN2VECTOR(vy, vx, vAngle)
            vst1q_f32(outRow + col, vAngle);
            col += 4;
        }

        // Scalar remainder of the row.
        while (col < size.width)
        {
            f32 gx = xRow[col], gy = yRow[col];
            f32 angle;
            FASTATAN2SCALAR(gy, gx, angle)
            outRow[col] = angle;
            col++;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS

1414
3rdparty/carotene/src/pyramid.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

460
3rdparty/carotene/src/reduce.cpp vendored Normal file
View File

@ -0,0 +1,460 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
/*
 * Column-wise sum of a u8 image into s32: dstBase[x] = sum over all rows of src(row, x),
 * saturated at 0x7fffffff.
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is added to a u8 pointer, i.e. bytes
 * dstBase   - size.width output sums; fully overwritten
 *
 * Columns are handled 16 at a time with NEON; the remaining columns use a scalar loop.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColSum(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  s32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The scalar tail accumulates into dstBase, so it has to start from zero.
    memset(dstBase, 0, size.width*sizeof(s32));
    size_t i = 0;
    for (; i + 16 <= size.width; i += 16)
    {
        const u8* src_address = srcBase + i;

        // 32-bit totals for the 16 columns, four lanes each.
        int32x4_t sll = vmovq_n_s32(0);
        int32x4_t slh = vmovq_n_s32(0);
        int32x4_t shl = vmovq_n_s32(0);
        int32x4_t shh = vmovq_n_s32(0);

        // Rows are summed in chunks of at most 256 so that the 16-bit partial sums
        // cannot overflow (256 * 255 < 65536).
        for (size_t h = 0; h < size.height; h += 256)
        {
            size_t lim = std::min(h + 256, size.height);
            uint16x8_t sl = vmovq_n_u16(0);
            uint16x8_t sh = vmovq_n_u16(0);
            for (size_t k = h; k < lim; ++k, src_address += srcStride)
            {
                internal::prefetch(src_address + srcStride, 0);
                uint8x16_t v = vld1q_u8(src_address);
                sl = vaddw_u8(sl, vget_low_u8(v));
                sh = vaddw_u8(sh, vget_high_u8(v));
            }

            // Widen the chunk sums to 32 bits and add them with saturation.
            int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
            int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
            int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
            int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));
            sll = vqaddq_s32(sll, vsll);
            slh = vqaddq_s32(slh, vslh);
            shl = vqaddq_s32(shl, vshl);
            shh = vqaddq_s32(shh, vshh);
        }

        vst1q_s32(dstBase + i + 0, sll);
        vst1q_s32(dstBase + i + 4, slh);
        vst1q_s32(dstBase + i + 8, shl);
        vst1q_s32(dstBase + i + 12, shh);
    }

    // Scalar tail for the remaining (fewer than 16) columns.
    for(size_t h = 0; h < size.height; ++h)
    {
        for(size_t j = i ; j < size.width; j++ )
        {
            const s32 v = srcBase[j + srcStride * h];
            // Saturate before adding: the running sum is never negative, so
            // 0x7fFFffFF - v cannot overflow, and the signed addition is only
            // performed when its result is representable.
            if (dstBase[j] > 0x7fFFffFF - v)
                dstBase[j] = 0x7fFFffFF;
            else
                dstBase[j] += v;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column-wise maximum of a u8 image: dstBase[x] = max over all rows of src(row, x).
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is added to a u8 pointer, i.e. bytes
 * dstBase   - size.width output values
 *
 * The first source row is always read, so the image is expected to have at least one row.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColMax(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed every column with the first row; the scalar tail at the end builds on this.
    memcpy(dstBase, srcBase, size.width);

    size_t col = 0;

    // 64 columns per step.
    while (col + 16*4 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t best0 = vld1q_u8(rowPtr + 0);
        uint8x16_t best1 = vld1q_u8(rowPtr + 16);
        uint8x16_t best2 = vld1q_u8(rowPtr + 32);
        uint8x16_t best3 = vld1q_u8(rowPtr + 48);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            internal::prefetch(rowPtr + srcStride, 32);
            best0 = vmaxq_u8(best0, vld1q_u8(rowPtr + 0));
            best1 = vmaxq_u8(best1, vld1q_u8(rowPtr + 16));
            best2 = vmaxq_u8(best2, vld1q_u8(rowPtr + 32));
            best3 = vmaxq_u8(best3, vld1q_u8(rowPtr + 48));
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col + 0, best0);
        vst1q_u8(dstBase + col + 16, best1);
        vst1q_u8(dstBase + col + 32, best2);
        vst1q_u8(dstBase + col + 48, best3);
        col += 16*4;
    }

    // 16 columns per step.
    while (col + 16 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t best = vld1q_u8(rowPtr);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            best = vmaxq_u8(best, vld1q_u8(rowPtr));
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col, best);
        col += 16;
    }

    // Scalar tail: remaining columns, rows 1 and up (row 0 came from the memcpy).
    for (size_t row = 1; row < size.height; ++row)
    {
        for (size_t x = col; x < size.width; ++x)
        {
            const u8 candidate = srcBase[x + srcStride * row];
            if (dstBase[x] < candidate)
                dstBase[x] = candidate;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column-wise minimum of a u8 image: dstBase[x] = min over all rows of src(row, x).
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is added to a u8 pointer, i.e. bytes
 * dstBase   - size.width output values
 *
 * The first source row is always read, so the image is expected to have at least one row.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColMin(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed every column with the first row; the scalar tail at the end builds on this.
    memcpy(dstBase, srcBase, size.width);

    size_t col = 0;

    // 64 columns per step.
    while (col + 16*4 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t low0 = vld1q_u8(rowPtr + 0);
        uint8x16_t low1 = vld1q_u8(rowPtr + 16);
        uint8x16_t low2 = vld1q_u8(rowPtr + 32);
        uint8x16_t low3 = vld1q_u8(rowPtr + 48);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            internal::prefetch(rowPtr + srcStride, 32);
            low0 = vminq_u8(low0, vld1q_u8(rowPtr + 0));
            low1 = vminq_u8(low1, vld1q_u8(rowPtr + 16));
            low2 = vminq_u8(low2, vld1q_u8(rowPtr + 32));
            low3 = vminq_u8(low3, vld1q_u8(rowPtr + 48));
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col + 0, low0);
        vst1q_u8(dstBase + col + 16, low1);
        vst1q_u8(dstBase + col + 32, low2);
        vst1q_u8(dstBase + col + 48, low3);
        col += 16*4;
    }

    // 16 columns per step.
    while (col + 16 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t low = vld1q_u8(rowPtr);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            low = vminq_u8(low, vld1q_u8(rowPtr));
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col, low);
        col += 16;
    }

    // Scalar tail: remaining columns, rows 1 and up (row 0 came from the memcpy).
    for (size_t row = 1; row < size.height; ++row)
    {
        for (size_t x = col; x < size.width; ++x)
        {
            const u8 candidate = srcBase[x + srcStride * row];
            if (candidate < dstBase[x])
                dstBase[x] = candidate;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column-wise sum of an f32 image: dstBase[x] = src(0, x) + src(1, x) + ... in row order.
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is divided by sizeof(f32), i.e. bytes
 * dstBase   - size.width output sums
 *
 * The first source row is always read, so the image is expected to have at least one row.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColSum(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed every column with the first row; the scalar tail at the end builds on this.
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row pitch expressed in f32 elements.
    const size_t rowStep = srcStride/sizeof(f32);
    size_t col = 0;

    // 16 columns per step.
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t acc0 = vld1q_f32(rowPtr + 0);
        float32x4_t acc1 = vld1q_f32(rowPtr + 4);
        float32x4_t acc2 = vld1q_f32(rowPtr + 8);
        float32x4_t acc3 = vld1q_f32(rowPtr + 12);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            internal::prefetch(rowPtr + rowStep, 32);
            acc0 = vaddq_f32(acc0, vld1q_f32(rowPtr + 0));
            acc1 = vaddq_f32(acc1, vld1q_f32(rowPtr + 4));
            acc2 = vaddq_f32(acc2, vld1q_f32(rowPtr + 8));
            acc3 = vaddq_f32(acc3, vld1q_f32(rowPtr + 12));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col + 0, acc0);
        vst1q_f32(dstBase + col + 4, acc1);
        vst1q_f32(dstBase + col + 8, acc2);
        vst1q_f32(dstBase + col + 12, acc3);
        col += 16;
    }

    // 4 columns per step.
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t acc = vld1q_f32(rowPtr);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            acc = vaddq_f32(acc, vld1q_f32(rowPtr));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col, acc);
        col += 4;
    }

    // Scalar tail: remaining columns, rows 1 and up (row 0 came from the memcpy).
    for (size_t row = 1; row < size.height; ++row)
    {
        for (size_t x = col; x < size.width; ++x)
        {
            dstBase[x] += srcBase[x + rowStep * row];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column-wise maximum of an f32 image: dstBase[x] = max over all rows of src(row, x).
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is divided by sizeof(f32), i.e. bytes
 * dstBase   - size.width output values
 *
 * The first source row is always read, so the image is expected to have at least one row.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColMax(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed every column with the first row; the scalar tail at the end builds on this.
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row pitch expressed in f32 elements.
    const size_t rowStep = srcStride/sizeof(f32);
    size_t col = 0;

    // 16 columns per step.
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t best0 = vld1q_f32(rowPtr + 0);
        float32x4_t best1 = vld1q_f32(rowPtr + 4);
        float32x4_t best2 = vld1q_f32(rowPtr + 8);
        float32x4_t best3 = vld1q_f32(rowPtr + 12);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            internal::prefetch(rowPtr + rowStep, 32);
            best0 = vmaxq_f32(best0, vld1q_f32(rowPtr + 0));
            best1 = vmaxq_f32(best1, vld1q_f32(rowPtr + 4));
            best2 = vmaxq_f32(best2, vld1q_f32(rowPtr + 8));
            best3 = vmaxq_f32(best3, vld1q_f32(rowPtr + 12));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col + 0, best0);
        vst1q_f32(dstBase + col + 4, best1);
        vst1q_f32(dstBase + col + 8, best2);
        vst1q_f32(dstBase + col + 12, best3);
        col += 16;
    }

    // 4 columns per step.
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t best = vld1q_f32(rowPtr);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            best = vmaxq_f32(best, vld1q_f32(rowPtr));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col, best);
        col += 4;
    }

    // Scalar tail: remaining columns, rows 1 and up (row 0 came from the memcpy).
    for (size_t row = 1; row < size.height; ++row)
    {
        for (size_t x = col; x < size.width; ++x)
        {
            const f32 candidate = srcBase[x + rowStep * row];
            if (dstBase[x] < candidate)
                dstBase[x] = candidate;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column-wise minimum of an f32 image: dstBase[x] = min over all rows of src(row, x).
 *
 * size      - width = number of columns (and of dstBase entries), height = number of rows
 * srcBase   - first row of the source; srcStride is divided by sizeof(f32), i.e. bytes
 * dstBase   - size.width output values
 *
 * The first source row is always read, so the image is expected to have at least one row.
 * Without CAROTENE_NEON nothing is computed after the configuration check.
 */
void reduceColMin(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed every column with the first row; the scalar tail at the end builds on this.
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row pitch expressed in f32 elements.
    const size_t rowStep = srcStride/sizeof(f32);
    size_t col = 0;

    // 16 columns per step.
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t low0 = vld1q_f32(rowPtr + 0);
        float32x4_t low1 = vld1q_f32(rowPtr + 4);
        float32x4_t low2 = vld1q_f32(rowPtr + 8);
        float32x4_t low3 = vld1q_f32(rowPtr + 12);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            internal::prefetch(rowPtr + rowStep, 32);
            low0 = vminq_f32(low0, vld1q_f32(rowPtr + 0));
            low1 = vminq_f32(low1, vld1q_f32(rowPtr + 4));
            low2 = vminq_f32(low2, vld1q_f32(rowPtr + 8));
            low3 = vminq_f32(low3, vld1q_f32(rowPtr + 12));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col + 0, low0);
        vst1q_f32(dstBase + col + 4, low1);
        vst1q_f32(dstBase + col + 8, low2);
        vst1q_f32(dstBase + col + 12, low3);
        col += 16;
    }

    // 4 columns per step.
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t low = vld1q_f32(rowPtr);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            low = vminq_f32(low, vld1q_f32(rowPtr));
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col, low);
        col += 4;
    }

    // Scalar tail: remaining columns, rows 1 and up (row 0 came from the memcpy).
    for (size_t row = 1; row < size.height; ++row)
    {
        for (size_t x = col; x < size.width; ++x)
        {
            const f32 candidate = srcBase[x + rowStep * row];
            if (candidate < dstBase[x])
                dstBase[x] = candidate;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
} // namespace CAROTENE_NS

694
3rdparty/carotene/src/remap.cpp vendored Normal file
View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace internal {
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
dst_row[x] = srcBase[map_row[x]];
}
}
}
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
s32 src_idx = map_row[x];
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
}
}
}
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s32 src00_index = map_row[(x << 2)];
s32 src10_index = map_row[(x << 2) + 2];
f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
srcBase[src00_index];
f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
srcBase[src10_index];
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
} // namespace internal
#endif // CAROTENE_NEON
/*
 * Reports whether remapNearestNeighbor can process a source image of the
 * given size: the configuration must be supported and, where size_t is
 * wider than 32 bits, neither dimension may exceed 0xffffFFFF because
 * source indices are evaluated in 32-bit arithmetic.
 */
bool isRemapNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    // Size is checked first; the configuration query is skipped for oversized images.
    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
    {
        return false;
    }
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
/*
 * Reports whether remapLinear can process a source image of the given size:
 * the configuration must be supported and, where size_t is wider than
 * 32 bits, neither dimension may exceed 0xffffFFFF because source indices
 * are evaluated in 32-bit arithmetic.
 */
bool isRemapLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    // Size is checked first; the configuration query is skipped for oversized images.
    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
    {
        return false;
    }
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
// Nearest-neighbour remap of a u8 image.
//
// tableBase holds one interleaved (x, y) pair of f32 source coordinates per
// destination pixel; table rows are tableStride bytes apart. The destination
// is walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels. For each tile
// the coordinates are first converted into source offsets
// (src_y * srcStride + src_x) stored in a scratch map on the stack, then
// remapNearestNeighborReplicate / remapNearestNeighborConst gathers the bytes.
//
// borderMode:
//   BORDER_MODE_REPLICATE - coordinates are clamped to [0, ssize.width - 1] x
//                           [0, ssize.height - 1];
//   BORDER_MODE_CONSTANT  - coordinates outside the source produce the offset
//                           -1, which the gather step replaces by borderValue;
//   anything else         - neither branch runs and dstBase is not written.
//
// Without CAROTENE_NEON the arguments are ignored after the assertion.
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Scratch offsets for one tile; the 16 spare entries leave room for alignPtr
// to move the start forward (presumably to a 16-byte boundary - see alignPtr).
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
// Largest valid source coordinates and the row stride, broadcast for the
// 4-lane and 2-lane code paths. All are narrowed to s32 here.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
int32x2_t v_step2 = vdup_n_s32(srcStride);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
int32x2_t v_zero2 = vdup_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// (j << 1): two floats per destination pixel in the coordinate table.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
// The scratch map is packed: its rows are blockWidth entries long.
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
// 8 pixels per iteration: vld2q_f32 splits the pairs into val[0] = x and
// val[1] = y; both are converted to s32, clamped, and combined to
// x + y * srcStride.
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
// 4 pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
}
// 2 pixels per iteration.
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
vst1_s32(map_row + x, v_dst_index);
}
// Scalar tail (at most one pixel): floorf instead of the vector conversion;
// after clamping to >= 0 both give the same offset for negative inputs.
for ( ; x < blockWidth; ++x)
{
s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// -1 is the offset that tells remapNearestNeighborConst to write borderValue.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
int32x2_t v_m1_2 = vdup_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
float32x2_t v_zero2 = vdup_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
// 8 pixels per iteration. The lower bound is tested on the float
// coordinate (>= 0) and the upper bound on the converted integer
// (<= width - 1 / height - 1); lanes failing either test get -1.
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
// 4 pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
}
// 2 pixels per iteration.
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
vst1_s32(map_row + x, v_dst_index);
}
// Scalar tail (at most one pixel): same range test on floored coordinates.
for ( ; x < blockWidth; ++x)
{
s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
// Bilinear remap of a u8 image.
//
// tableBase holds one interleaved (x, y) pair of f32 source coordinates per
// destination pixel; table rows are tableStride bytes apart. The destination
// is walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels. For each tile
// two stack scratch tables are filled and then handed to
// remapLinearReplicate / remapLinearConst:
//   map    - 4 source offsets per pixel, in the order (y0,x0), (y0,x1),
//            (y1,x0), (y1,x1), where x0/y0 are the floored coordinates and
//            x1 = x0 + 1, y1 = y0 + 1; each offset is y * srcStride + x;
//   coeffs - 2 weights per pixel: the fractional parts of x and of y.
//
// borderMode:
//   BORDER_MODE_REPLICATE - each corner coordinate is clamped to
//                           [0, ssize.width - 1] x [0, ssize.height - 1];
//   BORDER_MODE_CONSTANT  - a corner outside the source gets the offset -1,
//                           which the interpolation step reads as borderValue;
//   anything else         - neither branch runs and dstBase is not written.
//
// Without CAROTENE_NEON the arguments are ignored after the assertion.
void remapLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Per-tile scratch: 4 offsets (<< 2) and 2 weights (<< 1) per pixel; the 16
// spare entries leave room for alignPtr to move the start forward
// (presumably to a 16-byte boundary - see alignPtr).
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
// Largest valid source coordinates and the row stride, narrowed to s32.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// (j << 1): two floats per destination pixel in the coordinate table.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
// Scratch tables are packed: rows are blockWidth * 4 / blockWidth * 2 entries.
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
// 4 pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
// Fraction = coordinate - converted integer. A negative fraction means the
// conversion rounded up (negative input), so 1 is added to the fraction and
// 1 subtracted from the integer, turning the pair into floor + fraction.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
// Clamp both the floored coordinate and its +1 neighbour into the image.
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
// Offsets of the four corners: x + y * srcStride.
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
// Interleaving stores: 2 weights and 4 offsets end up adjacent per pixel.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
// Scalar tail (up to three pixels). Weights are taken from the unclamped
// floor, the offsets from the clamped coordinates.
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[x << 1] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// -1 is the offset that tells remapLinearConst to use borderValue.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
// 4 pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
// Same floor + fraction correction as in the replicate branch.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
// Unclamped corner offsets; invalid ones are replaced by -1 below.
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
// Per-axis validity: lower bound tested on the float coordinate (the +1
// neighbour uses coordinate + 1 >= 0), upper bound on the integer.
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
// Scalar tail (up to three pixels): each corner is range-checked on its
// own, so a pixel may mix real source samples with border samples.
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1)] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

85
3rdparty/carotene/src/remap.hpp vendored Normal file
View File

@ -0,0 +1,85 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_REMAP_HPP
#define CAROTENE_SRC_REMAP_HPP
#include "common.hpp"
#include <cmath>
#ifdef CAROTENE_NEON
// Internal helpers shared by the remap implementation. Only declared when
// the NEON code path is compiled in.
namespace CAROTENE_NS { namespace internal {
enum
{
// Edge length, in pixels, of the square tiles the remap functions process;
// it also sizes their per-tile scratch tables.
BLOCK_SIZE = 32
};
// Nearest-neighbour gather for one tile: dst(y, x) = srcBase[map(y, x)].
// `map` is a dense size.height x size.width table of offsets into srcBase;
// every entry is dereferenced.
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride);
// Same as above, except that a negative map entry writes borderValue
// instead of reading from srcBase.
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
// Bilinear interpolation for one tile. `map` holds 4 source offsets per
// destination pixel and `coeffs` 2 weights per destination pixel; every
// offset is dereferenced.
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride);
// Same as above, except that a negative map entry contributes borderValue
// instead of a source sample.
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
} }
#endif // CAROTENE_NEON
#endif // CAROTENE_SRC_REMAP_HPP

2191
3rdparty/carotene/src/resize.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

199
3rdparty/carotene/src/saturate_cast.hpp vendored Normal file
View File

@ -0,0 +1,199 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SATURATE_CAST_HPP
#define CAROTENE_SATURATE_CAST_HPP
#include <algorithm>
#include <climits>
#include <cmath>
#if defined _MSC_VER && defined _M_ARM
# include <intrin.h>
#endif
#include <carotene/definitions.hpp>
#include <carotene/types.hpp>
namespace CAROTENE_NS { namespace internal {
// Platform-specific float -> s32 rounding helpers.
//
// CAROTENE_ROUND_FLT(v) / CAROTENE_ROUND_DBL(v) expand to a statement (or a
// braced block) that *returns* the rounded value from the enclosing function,
// so they may only be used as the whole body of a function returning s32.
// When neither macro ends up defined, the round() overloads below fall back to
// a portable implementation.
#if defined _MSC_VER && defined _M_ARM
// Hand-assembled Thumb-2 routine: the argument arrives in d0, is converted with
// vcvtr.s32.f64 and handed back in r0. Declared as returning void because a
// naked function has no compiler-generated epilogue; the macros call it through
// a pointer cast to the real signature s32(f64).
__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
{
    (void)d;
    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
    __emit(0x0B40);
    __emit(0xEE10); // vmov r0, s0
    __emit(0x0A10);
    __emit(0x4770); // bx lr
}
// The argument is parenthesised so that expressions such as `a + b` are cast as a whole.
# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)(x));
# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
#elif defined CV_ICC || defined __GNUC__
# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
// Hardware VFP conversion via inline assembly. The result is produced in a
// floating-point register and read back as an integer through the union.
// The `register` storage class was dropped: it is only a hint and is no longer
// valid in C++17.
#  define CAROTENE_ROUND_FLT(value) { \
       union { f32 f; s32 i; } result; \
       asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
       return result.i; }
#  define CAROTENE_ROUND_DBL(value) { \
       union {f32 f; s32 i;} __tegra_result; \
       asm ( \
           "ftosid %0, %P1\n" \
           : "=w" (__tegra_result.f) \
           : "w" (value) \
       ); \
       return __tegra_result.i; \
   }
# else
// Generic GCC-compatible path: lrintf/lrint round according to the current
// floating-point rounding mode.
// The parameter is now actually used; previously the FLT variant was declared
// with parameter `x` but expanded to the caller's identifier `value`.
#  define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
#  define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
# endif
#endif
// Rounds a single-precision value to the nearest s32.
// Uses the platform macro when one was selected above; otherwise falls back to
// a portable routine in which an exact .5 tie on an even integer part keeps
// that integer part, and everything else is rounded half away from zero.
inline s32 round(f32 value)
{
#ifdef CAROTENE_ROUND_FLT
    CAROTENE_ROUND_FLT(value)
#else
    const s32 truncated = (s32)(value);
    const f32 remainder = value - truncated;

    // Exact tie sitting on an even integer: keep the truncated value.
    const bool exactTie = (remainder == 0.5 || remainder == -0.5);
    if (exactTie && (truncated % 2) == 0)
        return truncated;

    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
// Rounds a double-precision value to the nearest s32.
// Uses the platform macro when one was selected above; otherwise falls back to
// a portable routine in which an exact .5 tie on an even integer part keeps
// that integer part, and everything else is rounded half away from zero.
inline s32 round(f64 value)
{
#ifdef CAROTENE_ROUND_DBL
    CAROTENE_ROUND_DBL(value)
#else
    const s32 truncated = (s32)(value);
    const f64 remainder = value - truncated;

    // Exact tie sitting on an even integer: keep the truncated value.
    const bool exactTie = (remainder == 0.5 || remainder == -0.5);
    if (exactTie && (truncated % 2) == 0)
        return truncated;

    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/////////////// saturate_cast (used in image & signal processing) ///////////////////
// saturate_cast<Dst>(v) converts v to Dst, clamping to Dst's representable
// range wherever an explicit specialization exists below.
//
// The primary templates are plain conversions; they are what gets used for any
// (Dst, Src) pair without a specialization (e.g. widening conversions, or
// f32/f64 -> s64/u64, which are therefore NOT clamped).
//
// Floating-point sources are first rounded to s32 with round() and then
// clamped from there.
//
// Signed range checks use the "shift into unsigned" trick:
//     (unsigned)v - (unsigned)MIN <= (unsigned)(MAX - MIN)
// The subtraction is performed on unsigned operands on purpose: doing it in
// the signed domain (v - MIN) overflows for v close to the type's maximum,
// which is undefined behaviour.
template<typename _Tp> inline _Tp saturate_cast(u8 v)  { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s8 v)  { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }

// ---- destination: u8 ----
template<> inline u8 saturate_cast<u8>(s8 v)  { return (u8)std::max((s32)v, 0); }
template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }

// ---- destination: s8 ----
template<> inline s8 saturate_cast<s8>(u8 v)  { return (s8)std::min((s32)v, SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)v - (u32)SCHAR_MIN <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)v - (u64)SCHAR_MIN <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }

// ---- destination: u16 ----
template<> inline u16 saturate_cast<u16>(s8 v)  { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }

// ---- destination: s16 ----
template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)v - (u32)SHRT_MIN <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)v - (u64)SHRT_MIN <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }

// ---- destination: u32 ----
template<> inline u32 saturate_cast<u32>(s8 v)  { return (u32)std::max(v, (s8)0); }
template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
//OpenCV like f32/f64 -> u32 conversion
//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
//Negative clipping implementation
//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }

// ---- destination: s32 ----
template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)v - (u64)INT_MIN <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }

// ---- destination: u64 ----
template<> inline u64 saturate_cast<u64>(s8 v)  { return (u64)std::max(v, (s8)0); }
template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }

// ---- destination: s64 ----
template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
} }
#endif

219
3rdparty/carotene/src/scharr.cpp vendored Normal file
View File

@ -0,0 +1,219 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
// Tells whether Scharr3x3 can handle the request.
// Only the two pure first-order derivatives are accepted, (dx,dy) == (0,1) or
// (1,0); each is mapped to the separable-filter configuration that Scharr3x3
// will use (filter id 3 = caller-supplied weights, 1 = derivative kernel) and
// checked against isSeparableFilter3x3Supported.
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
    // d/dy
    if (dx == 0 && dy == 1 &&
        isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin))
        return true;

    // d/dx
    return dx == 1 && dy == 0 &&
           isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin);
}
// 3x3 Scharr derivative of an 8-bit image into a 16-bit signed image.
// The work is delegated to SeparableFilter3x3: the {3, 10, 3} kernel is passed
// as the custom weights of one pass (filter id 3) and the other pass uses the
// built-in filter id 1. Which pass gets which depends on whether dy == 1.
// The configuration is validated first via isScharr3x3Supported.
// Without CAROTENE_NEON the function does no filtering.
void Scharr3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               s16 * dstBase, ptrdiff_t dstStride,
               s32 dx, s32 dy,
               BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
    static s16 scharrKernel[] = {3, 10, 3};

    const bool derivativeAlongY = (dy == 1);
    if (!derivativeAlongY)
    {
        // d/dx: built-in filter 1 on rows, {3,10,3} on columns.
        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
                           1, 3, 0, scharrKernel,
                           border, borderValue, borderMargin);
        return;
    }

    // d/dy: {3,10,3} on rows, built-in filter 1 on columns.
    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
                       3, 1, scharrKernel, 0,
                       border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Computes both Scharr derivatives of an 8-bit image with `cn` interleaved
// channels and stores them interleaved in dst: for every source element x,
//     dst[2*x]     = horizontal derivative ([-1 0 1] across, [3 10 3] down)
//     dst[2*x + 1] = vertical derivative   ([3 10 3] across, [-1 0 1] down)
//
// Each row is processed in two passes through two temporary s16 rows:
//     trow0 = 3*(above + below) + 10*centre   (vertical smoothing)
//     trow1 = below - above                   (vertical difference)
// The temporary rows are then extended by `cn` elements on each side and
// combined horizontally. Missing neighbours at the image edge are taken from
// the mirrored row/column that does not repeat the edge itself (row 1 above
// row 0, and so on), or from the edge when there is only one row/column.
//
// Without CAROTENE_NEON the function does no work.
void ScharrDeriv(const Size2D &size, s32 cn,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t colsn = size.width*cn;
    // Last start index (exclusive) at which a full 8-element vector still fits.
    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;

    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
    std::vector<s16> _tempBuf((delta << 1) + 64);
    // trow0 starts at least `cn` elements into the buffer so that trow0[-cn..-1]
    // (the left border) is addressable.
    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);

    int16x8_t vc3 = vmovq_n_s16(3);
    int16x8_t vc10 = vmovq_n_s16(10);
    uint8x8_t v8c10 = vmov_n_u8(10);

    for(size_t y = 0; y < size.height; y++ )
    {
        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // do vertical convolution
        size_t x = 0;
        for( ; x < roiw8; x += 8 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            // The hand-written ARMv7 assembly is a workaround for old GCC 4.x only.
            // The major version must be checked as well: a bare __GNUC_MINOR__ test
            // also matches e.g. GCC 5.4 or 9.3, and clang (which reports itself as
            // GCC 4.2). The assembly uses ARMv7 d/q register syntax, so it is also
            // excluded on AArch64.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
                "vld1.8 {d0}, [%[src0]] \n\t"
                "vld1.8 {d2}, [%[src2]] \n\t"
                "vld1.8 {d1}, [%[src1]] \n\t"
                "vaddl.u8 q2, d2, d0 \n\t"
                "vmull.u8 q3, d1, %[vc10] \n\t"
                "vsubl.u8 q4, d2, d0 \n\t"
                "vmla.s16 q3, q2, %q[vc3] \n\t"
                "vst1.16 {d8-d9}, [%[out1],:128] \n\t"
                "vst1.16 {d6-d7}, [%[out0],:128] \n\t"
                :
                : [out0] "r" (trow0 + x),
                  [out1] "r" (trow1 + x),
                  [src0] "r" (srow0 + x),
                  [src1] "r" (srow1 + x),
                  [src2] "r" (srow2 + x),
                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
                // "memory": the stores through out0/out1 are not visible to the
                // compiler as operands.
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","memory"
            );
#else
            uint8x8_t s0 = vld1_u8(srow0 + x);
            uint8x8_t s1 = vld1_u8(srow1 + x);
            uint8x8_t s2 = vld1_u8(srow2 + x);

            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);

            vst1q_s16(trow1 + x, t1);
            vst1q_s16(trow0 + x, t0);
#endif
        }
        // scalar tail of the vertical pass
        for( ; x < colsn; x++ )
        {
            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
            trow1[x] = (s16)(srow2[x] - srow0[x]);
        }

        // make border
        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
        for( s32 k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
        for( ; x < roiw8; x += 8 )
        {
            // Same compiler restriction as for the vertical pass above.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
            __asm__ (
                "vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
                "vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
                "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
                "vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
                "vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
                "vadd.i16 q7, q2, q4 \n\t"
                "vmul.s16 q6, q3, %q[vc10] \n\t"
                "vsub.s16 q5, q1, q0 \n\t"
                "vmla.s16 q6, q7, %q[vc3] \n\t"
                "vst2.16 {d10-d13}, [%[out]] \n\t"
                :
                : [out] "r" (drow + x * 2),
                  [s0ptr] "r" (trow0 + x - cn),
                  [s1ptr] "r" (trow0 + x + cn),
                  [s2ptr] "r" (trow1 + x - cn),
                  [s3ptr] "r" (trow1 + x),
                  [s4ptr] "r" (trow1 + x + cn),
                  [vc10] "w" (vc10), [vc3] "w" (vc3)
                // "memory": the asm reads the temp rows and writes drow behind
                // the compiler's back.
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","memory"
            );
#else
            int16x8_t s0 = vld1q_s16(trow0 + x - cn);
            int16x8_t s1 = vld1q_s16(trow0 + x + cn);
            int16x8_t s2 = vld1q_s16(trow1 + x - cn);
            int16x8_t s3 = vld1q_s16(trow1 + x);
            int16x8_t s4 = vld1q_s16(trow1 + x + cn);

            int16x8_t s3x10 = vmulq_s16(s3, vc10);
            int16x8_t s24 = vaddq_s16(s2, s4);

            int16x8x2_t vr;
            vr.val[0] = vsubq_s16(s1, s0);
            vr.val[1] = vmlaq_s16(s3x10, s24, vc3);

            vst2q_s16(drow + x*2, vr);
#endif // old GCC 4.x workaround
        }
        // scalar tail of the horizontal pass
        for( ; x < colsn; x++ )
        {
            drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
            drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
        }
    }
#else
    (void)size;
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,109 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
// Tells whether SeparableFilter3x3 can process the request:
//  - the library configuration is supported,
//  - the image is at least 9 wide and 1 high, and image height plus the
//    top/bottom margins gives at least 2 rows,
//  - both filter ids (passed here as dx / dy) lie in [0, 3],
//  - the border mode is CONSTANT, REFLECT, REFLECT101 or REPLICATE.
bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
    if (!isSupportedConfiguration())
        return false;

    if (!(size.width >= 9 && size.height >= 1))
        return false;
    if (!((size.height + borderMargin.top + borderMargin.bottom) >= 2))
        return false;

    const bool filterIdsValid = (dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4);
    if (!filterIdsValid)
        return false;

    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REFLECT ||
           border == BORDER_MODE_REFLECT101 ||
           border == BORDER_MODE_REPLICATE;
}
// Applies a separable 3x3 filter to an 8-bit image, producing 16-bit output.
//
// rowFilter / colFilter pick one of four filter kinds for each pass and index
// a 4x4 table of template instantiations:
//     0 -> *_121      1 -> *_m101      2 -> *_1m21      3 -> *Generic
// xw / yw are the weights forwarded to the selected implementation; they must
// be non-null whenever the corresponding filter id is 3 (Generic), otherwise
// the process is aborted.
// The configuration is validated first via isSeparableFilter3x3Supported.
// Without CAROTENE_NEON the function does no filtering.
void SeparableFilter3x3(const Size2D &size,
                        const u8 * srcBase, ptrdiff_t srcStride,
                        s16 * dstBase, ptrdiff_t dstStride,
                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
#ifdef CAROTENE_NEON
    // A generic pass cannot run without caller-provided weights.
    const bool rowWeightsMissing = !xw && rowFilter >= 3;
    const bool colWeightsMissing = !yw && colFilter >= 3;
    if (rowWeightsMissing || colWeightsMissing)
        std::abort();

    typedef void (*SepFilterFn)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
                                const s16*, const s16*, BORDER_MODE, u8, Margin);

    // Indexed as dispatch[colFilter][rowFilter].
    static SepFilterFn dispatch[4][4]=
    {
        // column filter: 1 2 1
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process
        },
        // column filter: -1 0 1
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process
        },
        // column filter: 1 -2 1
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process
        },
        // column filter: generic (weights from yw)
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process
        }
    };

    SepFilterFn selected = dispatch[colFilter][rowFilter];
    selected(size, srcBase, srcStride, dstBase, dstStride,
             xw, yw, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)xw;
    (void)yw;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

1161
3rdparty/carotene/src/separable_filter.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

317
3rdparty/carotene/src/sobel.cpp vendored Normal file
View File

@ -0,0 +1,317 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
// Tells whether the u8 -> s16 Sobel3x3 can handle the request: both derivative
// orders must be in [0, 2], at least one of them non-zero, and the matching
// separable-filter configuration must itself be supported.
bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
                         s32 dx, s32 dy, Margin borderMargin)
{
    const bool dxInRange = dx < 3 && dx >= 0;
    if (!dxInRange)
        return false;

    const bool dyInRange = dy < 3 && dy >= 0;
    if (!dyInRange)
        return false;

    // (0, 0) would be a plain smoothing, not a derivative.
    if (!((dx + dy) > 0))
        return false;

    return isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
}
// 3x3 Sobel derivative of an 8-bit image into a 16-bit signed image.
// The derivative orders dx / dy are passed straight through as the row / column
// filter ids of SeparableFilter3x3, with no custom weights.
// The configuration is validated first via isSobel3x3Supported.
// Without CAROTENE_NEON the function does no filtering.
void Sobel3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              s16 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    const bool supported = isSobel3x3Supported(size, borderType, dx, dy, borderMargin);
    internal::assertSupportedConfiguration(supported);
#ifdef CAROTENE_NEON
    SeparableFilter3x3(size,
                       srcBase, srcStride,
                       dstBase, dstStride,
                       dx, dy,
                       0, 0, // built-in kernels only, no custom weights
                       borderType, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Tells whether the f32 Sobel3x3 can handle the request:
//  - the library configuration is supported,
//  - both derivative orders are in [0, 2] and at least one is non-zero,
//  - the image is at least 4 wide and 2 high,
//  - the border mode is CONSTANT, REFLECT, REFLECT101 or REPLICATE.
bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
                            s32 dx, s32 dy)
{
    if (!isSupportedConfiguration())
        return false;

    const bool ordersInRange = dx < 3 && dx >= 0 &&
                               dy < 3 && dy >= 0;
    if (!ordersInRange)
        return false;
    if (!((dx + dy) > 0))
        return false;

    if (!(size.width >= 4 && size.height >= 2))
        return false;

    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REFLECT ||
           border == BORDER_MODE_REFLECT101 ||
           border == BORDER_MODE_REPLICATE;
}
// 3x3 Sobel derivative of a single-precision image.
//
// dx / dy select the kernel used along x / y:
//     0 -> [1 2 1]     1 -> [-1 0 1]     2 -> [1 -2 1]
// The configuration is validated first via isSobel3x3f32Supported.
//
// Per row, the vertical kernel is applied to the rows above/at/below, then the
// horizontal kernel is applied to three neighbouring vertical results. Most of
// the row is handled four columns at a time; the remaining columns and the
// right border are handled by a scalar loop.
//
// Results for row y are staged in trow0 (even y) or trow1 (odd y) and copied to
// destination row y-1 one iteration later; the last row is flushed at the end
// (presumably so that dst may alias src -- confirm with callers).
//
// Without CAROTENE_NEON the function does no filtering.
void Sobel3x3(const Size2D &size,
              const f32 * srcBase, ptrdiff_t srcStride,
              f32 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, f32 borderValue)
{
    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
#ifdef CAROTENE_NEON
    // Row of border values that stands in for the row above the first / below
    // the last image row in constant-border mode.
    std::vector<f32> _tmp;
    f32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        // One leading element, `width` columns, plus 4 trailing elements: the
        // vector loop below loads 4 lanes starting at tmp + x with x as large as
        // size.width, i.e. it touches tmp[size.width + 3].
        _tmp.assign(size.width + 5, borderValue);
        tmp = &_tmp[1];
    }

    // Vertical-pass response of a column that consists only of borderValue:
    // [1 2 1] sums to 4, both derivative kernels sum to 0. This is the value the
    // scalar path below uses for the constant border as well.
    const f32 borderVert = dy ? 0.f : 4*borderValue;

    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
    std::vector<f32> _tempBuf((delta << 1) + 64);
    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);

    for( size_t y = 0; y < size.height; y++ )
    {
        const f32* srow0;
        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const f32* srow2;
        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
        if (borderType == BORDER_MODE_REFLECT101) {
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
        } else if (borderType == BORDER_MODE_CONSTANT) {
            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
        }

        float32x4_t tprev = vmovq_n_f32(0.f);
        float32x4_t tcurr = vmovq_n_f32(0.f);
        float32x4_t tnext = vmovq_n_f32(0.f);
        float32x4_t t0, t1, t2;

        // do vertical convolution
        // The loop runs one vector ahead of the stores, so on rows that still have
        // two rows below them it is allowed to load up to 4 elements past the row
        // end; on the last two rows it stops 4 columns early instead.
        size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
        for( ; x <= bcolsn; x += 4 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            float32x4_t x0 = vld1q_f32(srow0 + x);
            float32x4_t x1 = vld1q_f32(srow1 + x);
            float32x4_t x2 = vld1q_f32(srow2 + x);

            tprev = tcurr;
            tcurr = tnext;
            if(!dy)
            {
                tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);
            }
            else if(dy == 2)
            {
                tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));
            }
            else
            {
                tnext = vsubq_f32(x2, x0);
            }

            if(!x) {
                tcurr = tnext;
                // make border
                // Lane 3 of this vector becomes "column -1" in the next iteration
                // (via vextq_f32(tprev, tcurr, 3)).
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_f32(borderVert, tcurr, 3);
                }
                else if (borderType == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
                }
                else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
                }
                continue;
            }

            internal::prefetch(trow0 + x);
            internal::prefetch(trow1 + x);

            // left / centre / right neighbours for columns x-4 .. x-1
            t0 = vextq_f32(tprev, tcurr, 3);
            t1 = tcurr;
            t2 = vextq_f32(tcurr, tnext, 1);
            if(!dx)
            {
                t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));
            }
            else if(dx == 2)
            {
                t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));
            }
            else
            {
                t0 = vsubq_f32(t2, t0);
            }

            if(!(y%2))
            {
                vst1q_f32(trow0 + x - 4, t0);
            }
            else
            {
                vst1q_f32(trow1 + x - 4, t0);
            }
        }

        // Columns [0, x-4) are done. If the vector loop reached the row end, the
        // last stored column used data from beyond the row as its right
        // neighbour, so step back one column and redo it with proper border
        // handling.
        x -= 4;
        if(x == size.width){
            x--;
        }

        // Vertical results of the previous / current / next column.
        f32 prevx = 0, rowx = 0, nextx = 0;
        if(!dy)
        {
            prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
                    (borderType == BORDER_MODE_CONSTANT ? 4*borderValue :
                     srow2[0] + 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] + 2*srow1[x] + srow0[x];
        }
        else if(dy == 2)
        {
            prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
                    (borderType == BORDER_MODE_CONSTANT ? 0.f :
                     srow2[0] - 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] - 2*srow1[x] + srow0[x];
        }
        else
        {
            prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
                    (borderType == BORDER_MODE_CONSTANT ? 0.f :
                     srow2[0] - srow0[0]) );
            rowx = srow2[x] - srow0[x];
        }

        for( ; x < size.width; x++ )
        {
            if(x+1 == size.width) {
                // make border
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    if(!dy) {
                        nextx = 4*borderValue;
                    } else {
                        nextx = 0.f;
                    }
                } else if (borderType == BORDER_MODE_REFLECT101)
                {
                    if(!dy) {
                        nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
                    } else if(dy == 2) {
                        nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
                    } else {
                        nextx = srow2[x-1] - srow0[x-1];
                    }
                } else {
                    if(!dy) {
                        nextx = srow2[x] + 2*srow1[x] + srow0[x];
                    } else if(dy == 2) {
                        nextx = srow2[x] - 2*srow1[x] + srow0[x];
                    } else {
                        nextx = srow2[x] - srow0[x];
                    }
                }
            } else {
                if(!dy) {
                    nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
                } else if(dy == 2) {
                    nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
                } else {
                    nextx = srow2[x+1] - srow0[x+1];
                }
            }

            f32 res;
            if(dx==1) {
                res = nextx - prevx;
            } else if(!dx) {
                res = prevx + 2*rowx + nextx;
            } else {
                res = prevx - 2*rowx + nextx;
            }

            if(!(y%2)) {
                *(trow0+x) = res;
            } else {
                *(trow1+x) = res;
            }

            prevx = rowx;
            rowx = nextx;
        }

        // Flush the previous row's staged result (it sits in the other buffer).
        if(y>0) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(y%2)
                    *(drow + x1) = trow0[x1];
                else
                    *(drow + x1) = trow1[x1];
            }
        }
        // Last row: nothing will run after it, so flush it right away.
        if(y == size.height-1) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(!(y%2))
                    *(drow1 + x1) = trow0[x1];
                else
                    *(drow1 + x1) = trow1[x1];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

621
3rdparty/carotene/src/sub.cpp vendored Normal file
View File

@ -0,0 +1,621 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct SubWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
}
};
template <typename T, typename WT>
struct SubSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
}
};
} // namespace
#endif
// Element-wise dst = src0 - src1 for u8 inputs and u8 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u8, s16>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u8, s16>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 with u8 inputs widened to an s16 output.
// The conversion policy parameter is unnamed and ignored by this overload.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 32- and 8-element vector loops; clamped
    // to zero so rows shorter than one vector go straight to the scalar tail.
    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        // The vector stores go through a u16 view of the destination row
        // because vsubl_u8 yields uint16x8_t.
        u16 * outU16 = internal::getRowPtr((u16 *)dstBase, dstStride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);

        size_t x = 0;

        // 32 elements per iteration.
        while (x < end32)
        {
            internal::prefetch(lhs + x);
            internal::prefetch(rhs + x);

            const uint8x16_t lhsA = vld1q_u8(lhs + x);
            const uint8x16_t lhsB = vld1q_u8(lhs + x + 16);
            const uint8x16_t rhsA = vld1q_u8(rhs + x);
            const uint8x16_t rhsB = vld1q_u8(rhs + x + 16);

            vst1q_u16(outU16 + x,      vsubl_u8(vget_low_u8(lhsA),  vget_low_u8(rhsA)));
            vst1q_u16(outU16 + x + 8,  vsubl_u8(vget_high_u8(lhsA), vget_high_u8(rhsA)));
            vst1q_u16(outU16 + x + 16, vsubl_u8(vget_low_u8(lhsB),  vget_low_u8(rhsB)));
            vst1q_u16(outU16 + x + 24, vsubl_u8(vget_high_u8(lhsB), vget_high_u8(rhsB)));

            x += 32;
        }

        // 8 elements per iteration.
        while (x < end8)
        {
            const uint8x8_t lhs8 = vld1_u8(lhs + x);
            const uint8x8_t rhs8 = vld1_u8(rhs + x);
            vst1q_u16(outU16 + x, vsubl_u8(lhs8, rhs8));

            x += 8;
        }

        // Scalar tail.
        while (x < size.width)
        {
            out[x] = (s16)lhs[x] - (s16)rhs[x];
            ++x;
        }
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Element-wise dst = src0 - src1 with u8 inputs and an f32 output.
// The difference is formed in 16 bits (vsubl_u8 reinterpreted as signed),
// widened to 32 bits and converted to float.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         f32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 32- and 8-element vector loops.
    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        f32 * out = internal::getRowPtr(dstBase, dstStride, row);

        size_t x = 0;

        // 32 elements per iteration, handled as two groups of 16.
        while (x < end32)
        {
            internal::prefetch(lhs + x);
            internal::prefetch(rhs + x);

            const uint8x16_t lhsA = vld1q_u8(lhs + x);
            const uint8x16_t lhsB = vld1q_u8(lhs + x + 16);
            const uint8x16_t rhsA = vld1q_u8(rhs + x);
            const uint8x16_t rhsB = vld1q_u8(rhs + x + 16);

            // First 16 elements.
            const int16x8_t diffA_lo =
                vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsA), vget_low_u8(rhsA)));
            const int16x8_t diffA_hi =
                vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsA), vget_high_u8(rhsA)));
            vst1q_f32(out + x + 0,  vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffA_lo))));
            vst1q_f32(out + x + 4,  vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffA_lo))));
            vst1q_f32(out + x + 8,  vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffA_hi))));
            vst1q_f32(out + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffA_hi))));

            // Second 16 elements.
            const int16x8_t diffB_lo =
                vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsB), vget_low_u8(rhsB)));
            const int16x8_t diffB_hi =
                vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsB), vget_high_u8(rhsB)));
            vst1q_f32(out + x + 16, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffB_lo))));
            vst1q_f32(out + x + 20, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffB_lo))));
            vst1q_f32(out + x + 24, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffB_hi))));
            vst1q_f32(out + x + 28, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffB_hi))));

            x += 32;
        }

        // 8 elements per iteration.
        while (x < end8)
        {
            const uint8x8_t lhs8 = vld1_u8(lhs + x);
            const uint8x8_t rhs8 = vld1_u8(rhs + x);
            const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(lhs8, rhs8));
            vst1q_f32(out + x + 0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diff))));
            vst1q_f32(out + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diff))));

            x += 8;
        }

        // Scalar tail.
        while (x < size.width)
        {
            out[x] = (f32)lhs[x] - (f32)rhs[x];
            ++x;
        }
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Element-wise dst = src0 - src1 where src0 is u8, src1 is s16 and dst is s16.
// The u8 operand is zero-extended to 16 bits before the subtraction.
// policy == CONVERT_POLICY_SATURATE clamps to the s16 range; any other value
// wraps. Without CAROTENE_NEON the function only runs the configuration
// assertion.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 16- and 8-element vector loops.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
    const bool saturate = (policy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const s16 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);

        size_t x = 0;

        if (saturate)
        {
            // 16 elements per iteration.
            while (x < end16)
            {
                internal::prefetch(lhs + x);
                internal::prefetch(rhs + x);

                const uint8x16_t lhs16 = vld1q_u8(lhs + x);
                const int16x8_t lhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(lhs16)));
                const int16x8_t lhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(lhs16)));
                const int16x8_t rhsLo = vld1q_s16(rhs + x);
                const int16x8_t rhsHi = vld1q_s16(rhs + x + 8);

                const int16x8_t outLo = vqsubq_s16(lhsLo, rhsLo);
                const int16x8_t outHi = vqsubq_s16(lhsHi, rhsHi);
                vst1q_s16(out + x, outLo);
                vst1q_s16(out + x + 8, outHi);

                x += 16;
            }

            // 8 elements per iteration.
            while (x < end8)
            {
                const int16x8_t lhs8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs + x)));
                const int16x8_t rhs8 = vld1q_s16(rhs + x);
                vst1q_s16(out + x, vqsubq_s16(lhs8, rhs8));

                x += 8;
            }

            // Scalar tail.
            while (x < size.width)
            {
                out[x] = internal::saturate_cast<s16>((s32)lhs[x] - (s32)rhs[x]);
                ++x;
            }
        }
        else
        {
            // 16 elements per iteration.
            while (x < end16)
            {
                internal::prefetch(lhs + x);
                internal::prefetch(rhs + x);

                const uint8x16_t lhs16 = vld1q_u8(lhs + x);
                const int16x8_t lhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(lhs16)));
                const int16x8_t lhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(lhs16)));
                const int16x8_t rhsLo = vld1q_s16(rhs + x);
                const int16x8_t rhsHi = vld1q_s16(rhs + x + 8);

                const int16x8_t outLo = vsubq_s16(lhsLo, rhsLo);
                const int16x8_t outHi = vsubq_s16(lhsHi, rhsHi);
                vst1q_s16(out + x, outLo);
                vst1q_s16(out + x + 8, outHi);

                x += 16;
            }

            // 8 elements per iteration.
            while (x < end8)
            {
                const int16x8_t lhs8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs + x)));
                const int16x8_t rhs8 = vld1q_s16(rhs + x);
                vst1q_s16(out + x, vsubq_s16(lhs8, rhs8));

                x += 8;
            }

            // Scalar tail.
            while (x < size.width)
            {
                out[x] = (s16)((s32)lhs[x] - (s32)rhs[x]);
                ++x;
            }
        }
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 where src0 is s16, src1 is u8 and dst is s16.
// The u8 operand is zero-extended to 16 bits before the subtraction.
// policy == CONVERT_POLICY_SATURATE clamps to the s16 range; any other value
// wraps. Without CAROTENE_NEON the function only runs the configuration
// assertion.
void sub(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Exclusive upper bounds for the 16- and 8-element vector loops.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
    const bool saturate = (policy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);

        size_t x = 0;

        if (saturate)
        {
            // 16 elements per iteration.
            while (x < end16)
            {
                internal::prefetch(lhs + x);
                internal::prefetch(rhs + x);

                const int16x8_t lhsLo = vld1q_s16(lhs + x);
                const int16x8_t lhsHi = vld1q_s16(lhs + x + 8);
                const uint8x16_t rhs16 = vld1q_u8(rhs + x);
                const int16x8_t rhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rhs16)));
                const int16x8_t rhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(rhs16)));

                const int16x8_t outLo = vqsubq_s16(lhsLo, rhsLo);
                const int16x8_t outHi = vqsubq_s16(lhsHi, rhsHi);
                vst1q_s16(out + x, outLo);
                vst1q_s16(out + x + 8, outHi);

                x += 16;
            }

            // 8 elements per iteration.
            while (x < end8)
            {
                const int16x8_t lhs8 = vld1q_s16(lhs + x);
                const int16x8_t rhs8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs + x)));
                vst1q_s16(out + x, vqsubq_s16(lhs8, rhs8));

                x += 8;
            }

            // Scalar tail.
            while (x < size.width)
            {
                out[x] = internal::saturate_cast<s16>((s32)lhs[x] - (s32)rhs[x]);
                ++x;
            }
        }
        else
        {
            // 16 elements per iteration.
            while (x < end16)
            {
                internal::prefetch(lhs + x);
                internal::prefetch(rhs + x);

                const int16x8_t lhsLo = vld1q_s16(lhs + x);
                const int16x8_t lhsHi = vld1q_s16(lhs + x + 8);
                const uint8x16_t rhs16 = vld1q_u8(rhs + x);
                const int16x8_t rhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rhs16)));
                const int16x8_t rhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(rhs16)));

                const int16x8_t outLo = vsubq_s16(lhsLo, rhsLo);
                const int16x8_t outHi = vsubq_s16(lhsHi, rhsHi);
                vst1q_s16(out + x, outLo);
                vst1q_s16(out + x + 8, outHi);

                x += 16;
            }

            // 8 elements per iteration.
            while (x < end8)
            {
                const int16x8_t lhs8 = vld1q_s16(lhs + x);
                const int16x8_t rhs8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs + x)));
                vst1q_s16(out + x, vsubq_s16(lhs8, rhs8));

                x += 8;
            }

            // Scalar tail.
            while (x < size.width)
            {
                out[x] = (s16)((s32)lhs[x] - (s32)rhs[x]);
                ++x;
            }
        }
    }
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for s8 inputs and s8 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s8, s16>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s8, s16>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for s16 inputs and s16 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s16, s32>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s16, s32>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for u16 inputs and u16 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u16, s32>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u16, s32>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for s32 inputs and s32 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s32, s64>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s32, s64>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for u32 inputs and u32 output.
// policy == CONVERT_POLICY_SATURATE selects the saturating functor; any other
// value selects the wrapping one. Without CAROTENE_NEON the function only
// runs the configuration assertion and touches no data.
void sub(const Size2D &size,
         const u32 * src0Base, ptrdiff_t src0Stride,
         const u32 * src1Base, ptrdiff_t src1Stride,
         u32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        // Wrapping arithmetic.
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u32, s64>());
        return;
    }

    // Saturating arithmetic.
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u32, s64>());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise dst = src0 - src1 for f32 inputs and f32 output.
// There is no conversion policy for floats: the plain (non-saturating)
// functor is always used, with f32 as both element and working type.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sub(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    typedef SubWrap<f32, f32> FloatSub;

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         FloatSub());
#else
    // No NEON: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

385
3rdparty/carotene/src/sum.cpp vendored Normal file
View File

@ -0,0 +1,385 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
// Returns true when sum() can handle the given channel count, i.e. 1 to 4.
bool isSumSupported(u32 channels)
{
    if (channels == 0)
        return false;
    return channels <= 4;
}
// Adds the per-channel sums of a u8 image onto sumdst.
//   _size     - image size in pixels
//   srcBase   - first pixel; channels are interleaved
//   srcStride - row stride forwarded to internal::getRowPtr
//   sumdst    - `channels` running totals; they are accumulated into, not
//               overwritten, so the caller must initialise them
//   channels  - 1..4 (checked through isSumSupported)
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sum(const Size2D &_size,
         const u8 * srcBase, ptrdiff_t srcStride,
         u32 * sumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Treat the image as one long row when rows are laid out back to back.
    // A row holds width * channels bytes, so that is what the stride must be
    // compared against (same test as sqsum() below). Comparing against the
    // bare width missed continuous multi-channel images and, for a
    // multi-channel input whose stride equals width, collapsed rows that are
    // not contiguous and read past the end of the buffer.
    if (srcStride == (ptrdiff_t)(size.width * channels))
    {
        size.width *= size.height;
        size.height = 1;
    }
    const ptrdiff_t width = size.width * channels;

    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        ptrdiff_t i = 0;

        if (channels == 3)
        {
            // 24 bytes (8 pixels) are consumed per step in three 8-byte loads.
            // The accumulator names spell the channel held in each of the four
            // 32-bit lanes, e.g. vs1231 = {ch1, ch2, ch3, ch1}.
            uint32x4_t vs1231 = vdupq_n_u32(0);
            uint32x4_t vs3123 = vdupq_n_u32(0);
            uint32x4_t vs2312 = vdupq_n_u32(0);

            // Full blocks of 257 steps: 257 * 255 = 65535 is the most a u16
            // lane can hold, so the 16-bit accumulators cannot overflow.
            for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
                {
                    internal::prefetch(src + j + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + j + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + j + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
                }

                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }

            // Remaining whole 24-byte steps (fewer than 257 of them).
            if (i <= width - 8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
                {
                    internal::prefetch(src + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + 16));
                }

                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }

            // Spill the lanes as {1,2,3,1, 2,3,1,2, 3,1,2,3} so that every
            // third entry belongs to the same channel.
            u32 sum[12];
            vst1q_u32(sum+0, vs1231);
            vst1q_u32(sum+4, vs2312);
            vst1q_u32(sum+8, vs3123);

            // Scalar tail, one pixel at a time.
            for (; i < width; i += 3, src += 3)
            {
                sumdst[0] += src[0];
                sumdst[1] += src[1];
                sumdst[2] += src[2];
            }

            sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
            sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
            sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
        }
        else
        {
            // 1, 2 or 4 channels: 8 bytes per step. Because the channel count
            // divides 4, lane n of vs always belongs to channel n % channels.
            uint32x4_t vs = vdupq_n_u32(0);

            // Full blocks of 257 steps (see the overflow note above).
            for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
                for (int j = 8; j < 257 * 8; j += 8)
                {
                    internal::prefetch(src + j);
                    s1 = vaddw_u8(s1, vld1_u8(src + j));
                }

                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }

            // Remaining whole 8-byte steps.
            if (i < width - 7)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
                for(i+=8,src+=8; i < width-7; i+=8,src+=8)
                {
                    internal::prefetch(src);
                    s1 = vaddw_u8(s1, vld1_u8(src));
                }
                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }

            if (channels == 1)
            {
                // Fold all four lanes into one total, then add the tail.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));

                u32 s0 = vget_lane_u32(vs1, 0);
                for(; i < width; ++i,++src)
                    s0 += src[0];
                sumdst[0] += s0;
            }
            else if (channels == 4)
            {
                // One lane per channel: add the vector straight onto sumdst.
                vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));

                for(; i < width; i+=4,src+=4)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                    sumdst[2] += src[2];
                    sumdst[3] += src[3];
                }
            }
            else//if (channels == 2)
            {
                // Lanes are {ch0, ch1, ch0, ch1}: fold high onto low.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));

                for(; i < width; i+=2,src+=2)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                }
            }
        }//channels != 3
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)channels;
#endif
}
// Adds the per-channel sums of an f32 image onto sumdst.
//   _size     - image size in pixels
//   srcBase   - first pixel; channels are interleaved
//   srcStride - row stride forwarded to internal::getRowPtr
//   sumdst    - `channels` running f64 totals; they are accumulated into, not
//               overwritten, so the caller must initialise them
//   channels  - 1..4 (checked through isSumSupported)
// Each row is summed in float32 vector lanes and flushed into the f64 totals
// at the end of the row.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sum(const Size2D &_size,
         const f32 * srcBase, ptrdiff_t srcStride,
         f64 * sumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
    // The original code collapsed the image into a single row when
    // srcStride == size.width. For f32 data that compares a byte stride with
    // an element count, so it could only match when rows overlap, and the
    // collapsed row was then read beyond the end of the buffer. The collapse
    // is dropped instead of corrected on purpose: summing row by row keeps
    // each float32 partial sum short before it is added to the f64 totals,
    // and leaves results for every non-overlapping layout unchanged.
    const ptrdiff_t width = _size.width * channels;

    for(size_t k = 0; k < _size.height; ++k)
    {
        const f32* src = internal::getRowPtr( srcBase, srcStride, k);
        ptrdiff_t i = 0;

        if (channels == 3)
        {
            // 12 floats (4 pixels) per step. The accumulator names spell the
            // channel held in each lane, e.g. vs1231 = {ch1, ch2, ch3, ch1}.
            float32x4_t vs1231 = vdupq_n_f32(0);
            float32x4_t vs2312 = vdupq_n_f32(0);
            float32x4_t vs3123 = vdupq_n_f32(0);
            for(; i <= width-12; i += 12)
            {
                internal::prefetch(src + i + 12);
                vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
                vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
                vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
            }

            // Spill the lanes as {1,2,3,1, 2,3,1,2, 3,1,2,3} so that every
            // third entry belongs to the same channel.
            f32 s[12];
            vst1q_f32(s + 0, vs1231);
            vst1q_f32(s + 4, vs2312);
            vst1q_f32(s + 8, vs3123);

            sumdst[0] += s[0] + s[3] + s[6] + s[9];
            sumdst[1] += s[1] + s[4] + s[7] + s[10];
            sumdst[2] += s[2] + s[5] + s[8] + s[11];

            // Scalar tail, one pixel at a time.
            for( ; i < width; i+=3)
            {
                sumdst[0] += src[i];
                sumdst[1] += src[i+1];
                sumdst[2] += src[i+2];
            }
        }
        else
        {
            // 1, 2 or 4 channels: 4 floats per step. Because the channel count
            // divides 4, lane n of vs always belongs to channel n % channels.
            float32x4_t vs = vdupq_n_f32(0);
            for(; i <= width-4; i += 4)
            {
                internal::prefetch(src + i);
                vs = vaddq_f32(vs, vld1q_f32(src+i));
            }

            if (channels == 1)
            {
                // Fold all four lanes into one total, then add the tail.
                float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
                f32 s[2];
                vst1_f32(s, vs2);

                sumdst[0] += s[0] + s[1];
                for( ; i < width; i++)
                    sumdst[0] += src[i];
            }
            else if (channels == 4)
            {
                // One lane per channel; the row length is a multiple of 4, so
                // there is no tail.
                f32 s[4];
                vst1q_f32(s, vs);

                sumdst[0] += s[0];
                sumdst[1] += s[1];
                sumdst[2] += s[2];
                sumdst[3] += s[3];
            }
            else//if (channels == 2)
            {
                // Lanes are {ch0, ch1, ch0, ch1}: fold high onto low.
                float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
                f32 s[2];
                vst1_f32(s, vs2);

                sumdst[0] += s[0];
                sumdst[1] += s[1];

                // At most one pixel (two floats) can be left over.
                if(i < width)
                {
                    sumdst[0] += src[i];
                    sumdst[1] += src[i+1];
                }
            }
        }//channels != 3
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)channels;
#endif
}
// Returns true when sqsum() can handle the given channel count: it must be a
// non-zero divisor of 4, i.e. 1, 2 or 4.
bool isSqsumSupported(u32 channels)
{
    if (channels == 0)
        return false;
    return 4 % channels == 0;
}
// Adds the per-channel sums and sums of squares of a u8 image onto sumdst and
// sqsumdst.
//   _size     - image size in pixels
//   srcBase   - first pixel; channels are interleaved
//   srcStride - row stride forwarded to internal::getRowPtr
//   sumdst    - `channels` running f64 totals of the values
//   sqsumdst  - `channels` running f64 totals of the squared values
//   channels  - 1, 2 or 4 (checked through isSqsumSupported)
// Both outputs are accumulated into, not overwritten, so the caller must
// initialise them.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void sqsum(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride,
           f64 * sumdst, f64 * sqsumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSqsumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Treat the image as one long row when rows are laid out back to back.
    if (srcStride == (ptrdiff_t)(size.width*channels))
    {
        size.width *= size.height;
        size.height = 1;
    }
    const size_t width = size.width * channels;

    // Number of elements accumulated in 32-bit lanes before flushing to f64.
    // Every 8-element step adds at most 2 * 255 * 255 = 130050 to a lane of
    // v_sqsum, so a lane survives at most 33025 steps. 1 << 18 elements is
    // 32768 steps (at most 4261478400 < 2^32). The previous value of 1 << 23
    // allowed 2^20 steps and silently wrapped on bright images.
    size_t blockSize0 = 1 << 18;
    size_t roiw8 = width & ~7;

    uint32x4_t v_zero = vdupq_n_u32(0u);

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw8)
        {
            // End index (exclusive) of the current block.
            size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
            uint32x4_t v_sum = v_zero;
            uint32x4_t v_sqsum = v_zero;

            for ( ; j < blockSize ; j += 8, src += 8)
            {
                internal::prefetch(src);
                uint8x8_t v_src0 = vld1_u8(src);

                uint16x8_t v_src = vmovl_u8(v_src0);
                uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
                v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
                v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
                v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
            }

            u32 arsum[8];
            vst1q_u32(arsum, v_sum);
            vst1q_u32(arsum + 4, v_sqsum);

            // Lane n belongs to channel n % channels because the channel
            // count divides 4.
            sumdst[0] += (f64)arsum[0];
            sumdst[1 % channels] += (f64)arsum[1];
            sumdst[2 % channels] += (f64)arsum[2];
            sumdst[3 % channels] += (f64)arsum[3];
            sqsumdst[0] += (f64)arsum[4];
            sqsumdst[1 % channels] += (f64)arsum[5];
            sqsumdst[2 % channels] += (f64)arsum[6];
            sqsumdst[3 % channels] += (f64)arsum[7];
        }

        // Collect the last few elements of the current row. Stepping by
        // `channels` is safe: with 1, 2 or 4 channels the number of elements
        // left after the 8-wide loop is always a multiple of `channels`.
        for ( ; j < width; j+=channels, src+=channels)
        {
            for (u32 kk = 0; kk < channels; kk++)
            {
                u32 srcval = src[kk];
                sumdst[kk] += srcval;
                sqsumdst[kk] += srcval * srcval;
            }
        }
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)sqsumdst;
    (void)channels;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than
//time for simultaneous 2 lines matching for the same amount of data
// Returns true when matchTemplate() accepts a template of the given size:
// the configuration must be supported, the template at least 8 pixels wide
// and its area no larger than 256 pixels.
bool isMatchTemplateSupported(const Size2D &tmplSize)
{
    if (!isSupportedConfiguration())
        return false;

    // Narrower templates could be processed too, but they would get no NEON
    // optimization.
    if (tmplSize.width < 8)
        return false;

    return (tmplSize.width * tmplSize.height) <= 256;
}
// Cross-correlates a u8 template with a u8 image.
//   srcSize / srcBase / srcStride    - source image; stride is added directly
//                                      to a u8 pointer, so it is in bytes
//   tmplSize / tmplBase / tmplStride - template, same conventions; must pass
//                                      isMatchTemplateSupported
//   dstBase / dstStride              - f32 result of size
//                                      (srcW - tmplW + 1) x (srcH - tmplH + 1);
//                                      stride in bytes
//   normalize                        - when true, each correlation value is
//                                      divided by tn * sqrt(sum of squared
//                                      source pixels under the template),
//                                      where tn = sqrt(normL2(template))
// Nothing is written when the source is smaller than the template. A zero
// normalization denominator yields 0 instead of NaN/inf.
// Without CAROTENE_NEON the function only runs the configuration assertion.
void matchTemplate(const Size2D &srcSize,
                   const u8 * srcBase, ptrdiff_t srcStride,
                   const Size2D &tmplSize,
                   const u8 * tmplBase, ptrdiff_t tmplStride,
                   f32 * dstBase, ptrdiff_t dstStride,
                   bool normalize)
{
    internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
#ifdef CAROTENE_NEON
    // The result dimensions below are unsigned; without this guard a source
    // smaller than the template would wrap them around to huge values.
    if (srcSize.width < tmplSize.width || srcSize.height < tmplSize.height)
        return;

    const size_t tmplW = tmplSize.width;
    const size_t tmplH = tmplSize.height;
    const size_t dstW = srcSize.width - tmplSize.width + 1;
    const size_t dstH = srcSize.height - tmplSize.height + 1;

    //template correlation part
    {
#if ENABLE4LINESMATCHING
        const size_t dstroiw4 = dstW & ~3u;
#endif
        const size_t dstroiw2 = dstW & ~1u;
        const size_t tmplroiw = tmplW & ~7u;
        // f32 elements per destination row (byte stride / 4).
        const size_t dstride = dstStride >> 2;

        f32 *corr = dstBase;
        const u8 *imgrrow = srcBase;
        for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
        {
            size_t c = 0;
#if ENABLE4LINESMATCHING
            // Four neighbouring result columns at once.
            for(; c < dstroiw4; c+=4)
            {
                u32 dot[4] = {0, 0, 0, 0};
                uint32x4_t vdot0 = vmovq_n_u32(0);
                uint32x4_t vdot1 = vmovq_n_u32(0);
                uint32x4_t vdot2 = vmovq_n_u32(0);
                uint32x4_t vdot3 = vmovq_n_u32(0);

                const u8 *img = imgrrow;
                const u8 *tmpl = tmplBase;
                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
                {
                    size_t j = 0;
                    for(; j < tmplroiw; j+=8)
                    {
                        uint8x8_t vtmpl = vld1_u8(tmpl + j);

                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
                        uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
                        uint8x8_t vimg3 = vld1_u8(img + j + c + 3);

                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
                        uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
                        uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);

                        vdot0 = vpadalq_u16(vdot0, vd0);
                        vdot1 = vpadalq_u16(vdot1, vd1);
                        vdot2 = vpadalq_u16(vdot2, vd2);
                        vdot3 = vpadalq_u16(vdot3, vd3);
                    }
                    for(; j < tmplW; ++j)
                    {
                        dot[0] += tmpl[j] * img[j + c + 0];
                        dot[1] += tmpl[j] * img[j + c + 1];
                        dot[2] += tmpl[j] * img[j + c + 2];
                        dot[3] += tmpl[j] * img[j + c + 3];
                    }
                }
                uint32x4_t vdotx = vld1q_u32(dot);
                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
                uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
                uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
                uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
                uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);

                vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
            }
#endif

            // Two neighbouring result columns at once.
            for(; c < dstroiw2; c+=2)
            {
                // Scalar-tail dot products and vector accumulators.
                u32 dot[2] = {0, 0};
                uint32x4_t vdot0 = vmovq_n_u32(0);
                uint32x4_t vdot1 = vmovq_n_u32(0);
                const u8 *img = imgrrow;
                const u8 *tmpl = tmplBase;
                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
                {
                    size_t j = 0;
                    for(; j < tmplroiw; j+=8)
                    {
                        uint8x8_t vtmpl = vld1_u8(tmpl + j);

                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);

                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);

                        vdot0 = vpadalq_u16(vdot0, vd0);
                        vdot1 = vpadalq_u16(vdot1, vd1);
                    }
                    for(; j < tmplW; ++j)
                    {
                        dot[0] += tmpl[j] * img[j + c + 0];
                        dot[1] += tmpl[j] * img[j + c + 1];
                    }
                }
                // Reduce each accumulator to one value and add the scalar part.
                uint32x2_t vdotx = vld1_u32(dot);
                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
                uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
                vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
            }

            // Last column when dstW is odd.
            for(; c < dstW; ++c)
            {
                u32 dot = 0;
                uint32x4_t vdot = vmovq_n_u32(0);
                const u8 *img = imgrrow;
                const u8 *tmpl = tmplBase;
                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
                {
                    size_t j = 0;
                    for(; j < tmplroiw; j+=8)
                    {
                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
                        uint8x8_t vimg = vld1_u8(img + j + c);
                        uint16x8_t vd = vmull_u8(vtmpl, vimg);
                        vdot = vpadalq_u16(vdot, vd);
                    }
                    for(; j < tmplW; ++j)
                        dot += tmpl[j] * img[j + c];
                }
                u32 wdot[2];
                vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
                dot += wdot[0] + wdot[1];
                corr[c] = (f32)dot;
            }
        }
    }

    if(normalize)
    {
        f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));

        // Integral image of squared source pixels with one extra leading row
        // and column of zeros. std::vector value-initialises its elements, so
        // that zero border needs no explicit clearing.
        size_t iw = srcSize.width+1;
        size_t ih = srcSize.height+1;
        std::vector<f64> _sqsum(iw*ih);
        f64 *sqsum = &_sqsum[0];
        sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));

        for(size_t i = 0; i < dstH; ++i)
        {
            f32 *result = internal::getRowPtr(dstBase, dstStride, i);
            for(size_t j = 0; j < dstW; ++j)
            {
                // Sum of squares under the template via four corner lookups.
                double s2 = sqsum[iw*i + j] +
                            sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
                            sqsum[iw*(i + tmplSize.height) + j] -
                            sqsum[iw*i + j + tmplSize.width];

                const double denom = tn * std::sqrt(s2);
                if (denom != 0.0)
                    result[j] /= denom;
                else
                    result[j] = 0.f; // all-zero window or template: avoid 0/0
            }
        }
    }
#else
    (void)srcSize;
    (void)srcBase;
    (void)srcStride;
    (void)tmplBase;
    (void)tmplStride;
    (void)dstBase;
    (void)dstStride;
    (void)normalize;
#endif
}
} // namespace CAROTENE_NS

1627
3rdparty/carotene/src/threshold.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

689
3rdparty/carotene/src/vtransform.hpp vendored Normal file
View File

@ -0,0 +1,689 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VTRANSFORM_HPP
#define CAROTENE_SRC_VTRANSFORM_HPP
#include "common.hpp"
#include <carotene/types.hpp>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
////////////////////////////// Type Traits ///////////////////////
// VecTraits<T, cn> maps an element type T and a channel count cn (1..4) to
// the NEON vector types used to process it:
//   vec128 - the 128-bit (q) vector type, or the xN structure of such vectors when cn > 1
//   vec64  - the 64-bit (d) counterpart of vec128
//   unsign - the VecTraits of the unsigned integer type that has the same element
//            width and the same channel count (f32 maps to u32)
// The primary template is only declared, so using the members of any (T, cn)
// combination that is not specialized below is a compile-time error.
template <typename T, int cn = 1>
struct VecTraits;
// Single-channel vectors.
template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
// Two-channel vector structures.
template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
// Three-channel vector structures.
template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 3> unsign; };
template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef VecTraits< u64, 3> unsign; };
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
// Four-channel vector structures.
template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 4> unsign; };
template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 4> unsign; };
template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 4> unsign; };
template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 4> unsign; };
template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 4> unsign; };
template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 4> unsign; };
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
////////////////////////////// vld1q ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld1q_<type> intrinsic.
inline uint8x16_t vld1q(const u8* src) { return vld1q_u8(src); }
inline int8x16_t vld1q(const s8* src) { return vld1q_s8(src); }
inline uint16x8_t vld1q(const u16* src) { return vld1q_u16(src); }
inline int16x8_t vld1q(const s16* src) { return vld1q_s16(src); }
inline uint32x4_t vld1q(const u32* src) { return vld1q_u32(src); }
inline int32x4_t vld1q(const s32* src) { return vld1q_s32(src); }
inline float32x4_t vld1q(const f32* src) { return vld1q_f32(src); }
////////////////////////////// vld1 ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld1_<type> intrinsic.
inline uint8x8_t vld1(const u8* src) { return vld1_u8(src); }
inline int8x8_t vld1(const s8* src) { return vld1_s8(src); }
inline uint16x4_t vld1(const u16* src) { return vld1_u16(src); }
inline int16x4_t vld1(const s16* src) { return vld1_s16(src); }
inline uint32x2_t vld1(const u32* src) { return vld1_u32(src); }
inline int32x2_t vld1(const s32* src) { return vld1_s32(src); }
inline float32x2_t vld1(const f32* src) { return vld1_f32(src); }
////////////////////////////// vld2q ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld2q_<type> intrinsic.
inline uint8x16x2_t vld2q(const u8* src) { return vld2q_u8(src); }
inline int8x16x2_t vld2q(const s8* src) { return vld2q_s8(src); }
inline uint16x8x2_t vld2q(const u16* src) { return vld2q_u16(src); }
inline int16x8x2_t vld2q(const s16* src) { return vld2q_s16(src); }
inline uint32x4x2_t vld2q(const u32* src) { return vld2q_u32(src); }
inline int32x4x2_t vld2q(const s32* src) { return vld2q_s32(src); }
inline float32x4x2_t vld2q(const f32* src) { return vld2q_f32(src); }
////////////////////////////// vld2 ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld2_<type> intrinsic.
inline uint8x8x2_t vld2(const u8* src) { return vld2_u8(src); }
inline int8x8x2_t vld2(const s8* src) { return vld2_s8(src); }
inline uint16x4x2_t vld2(const u16* src) { return vld2_u16(src); }
inline int16x4x2_t vld2(const s16* src) { return vld2_s16(src); }
inline uint32x2x2_t vld2(const u32* src) { return vld2_u32(src); }
inline int32x2x2_t vld2(const s32* src) { return vld2_s32(src); }
inline float32x2x2_t vld2(const f32* src) { return vld2_f32(src); }
////////////////////////////// vld3q ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld3q_<type> intrinsic.
inline uint8x16x3_t vld3q(const u8* src) { return vld3q_u8(src); }
inline int8x16x3_t vld3q(const s8* src) { return vld3q_s8(src); }
inline uint16x8x3_t vld3q(const u16* src) { return vld3q_u16(src); }
inline int16x8x3_t vld3q(const s16* src) { return vld3q_s16(src); }
inline uint32x4x3_t vld3q(const u32* src) { return vld3q_u32(src); }
inline int32x4x3_t vld3q(const s32* src) { return vld3q_s32(src); }
inline float32x4x3_t vld3q(const f32* src) { return vld3q_f32(src); }
////////////////////////////// vld3 ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld3_<type> intrinsic.
inline uint8x8x3_t vld3(const u8* src) { return vld3_u8(src); }
inline int8x8x3_t vld3(const s8* src) { return vld3_s8(src); }
inline uint16x4x3_t vld3(const u16* src) { return vld3_u16(src); }
inline int16x4x3_t vld3(const s16* src) { return vld3_s16(src); }
inline uint32x2x3_t vld3(const u32* src) { return vld3_u32(src); }
inline int32x2x3_t vld3(const s32* src) { return vld3_s32(src); }
inline float32x2x3_t vld3(const f32* src) { return vld3_f32(src); }
////////////////////////////// vld4q ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld4q_<type> intrinsic.
inline uint8x16x4_t vld4q(const u8* src) { return vld4q_u8(src); }
inline int8x16x4_t vld4q(const s8* src) { return vld4q_s8(src); }
inline uint16x8x4_t vld4q(const u16* src) { return vld4q_u16(src); }
inline int16x8x4_t vld4q(const s16* src) { return vld4q_s16(src); }
inline uint32x4x4_t vld4q(const u32* src) { return vld4q_u32(src); }
inline int32x4x4_t vld4q(const s32* src) { return vld4q_s32(src); }
inline float32x4x4_t vld4q(const f32* src) { return vld4q_f32(src); }
////////////////////////////// vld4 ///////////////////////
// Overload set selected by the element pointer type; each forwards to the matching vld4_<type> intrinsic.
inline uint8x8x4_t vld4(const u8* src) { return vld4_u8(src); }
inline int8x8x4_t vld4(const s8* src) { return vld4_s8(src); }
inline uint16x4x4_t vld4(const u16* src) { return vld4_u16(src); }
inline int16x4x4_t vld4(const s16* src) { return vld4_s16(src); }
inline uint32x2x4_t vld4(const u32* src) { return vld4_u32(src); }
inline int32x2x4_t vld4(const s32* src) { return vld4_s32(src); }
inline float32x2x4_t vld4(const f32* src) { return vld4_f32(src); }
////////////////////////////// vst1q ///////////////////////
inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); }
inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); }
inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); }
inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); }
inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); }
inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); }
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
////////////////////////////// vst1 ///////////////////////
inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); }
inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); }
inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); }
inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); }
inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); }
inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); }
inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); }
////////////////////////////// vst2q ///////////////////////
inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); }
inline void vst2q(s8 * ptr, const int8x16x2_t & v) { return vst2q_s8(ptr, v); }
inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); }
inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); }
inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); }
inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); }
inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); }
////////////////////////////// vst2 ///////////////////////
inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); }
inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); }
inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); }
inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); }
inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); }
inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); }
inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); }
////////////////////////////// vst3q ///////////////////////
inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); }
inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); }
inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); }
inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); }
inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); }
inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); }
inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); }
////////////////////////////// vst3 ///////////////////////
inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); }
inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); }
inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); }
inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); }
inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); }
inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); }
inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); }
////////////////////////////// vst4q ///////////////////////
inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); }
inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); }
inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); }
inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); }
inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); }
inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); }
inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }
////////////////////////////// vst4 ///////////////////////
inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); }
inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); }
inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); }
inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); }
inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); }
inline void vst4(s32 * ptr, const int32x2x4_t & v) { return vst4_s32(ptr, v); }
inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); }
////////////////////////////// vabdq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vabdq_<type> intrinsic.
inline uint8x16_t vabdq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vabdq_u8(lhs, rhs); }
inline int8x16_t vabdq(const int8x16_t& lhs, const int8x16_t& rhs) { return vabdq_s8(lhs, rhs); }
inline uint16x8_t vabdq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vabdq_u16(lhs, rhs); }
inline int16x8_t vabdq(const int16x8_t& lhs, const int16x8_t& rhs) { return vabdq_s16(lhs, rhs); }
inline uint32x4_t vabdq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vabdq_u32(lhs, rhs); }
inline int32x4_t vabdq(const int32x4_t& lhs, const int32x4_t& rhs) { return vabdq_s32(lhs, rhs); }
inline float32x4_t vabdq(const float32x4_t& lhs, const float32x4_t& rhs) { return vabdq_f32(lhs, rhs); }
////////////////////////////// vabd ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vabd_<type> intrinsic.
inline uint8x8_t vabd(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vabd_u8(lhs, rhs); }
inline int8x8_t vabd(const int8x8_t& lhs, const int8x8_t& rhs) { return vabd_s8(lhs, rhs); }
inline uint16x4_t vabd(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vabd_u16(lhs, rhs); }
inline int16x4_t vabd(const int16x4_t& lhs, const int16x4_t& rhs) { return vabd_s16(lhs, rhs); }
inline uint32x2_t vabd(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vabd_u32(lhs, rhs); }
inline int32x2_t vabd(const int32x2_t& lhs, const int32x2_t& rhs) { return vabd_s32(lhs, rhs); }
inline float32x2_t vabd(const float32x2_t& lhs, const float32x2_t& rhs) { return vabd_f32(lhs, rhs); }
////////////////////////////// vminq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vminq_<type> intrinsic.
inline uint8x16_t vminq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vminq_u8(lhs, rhs); }
inline int8x16_t vminq(const int8x16_t& lhs, const int8x16_t& rhs) { return vminq_s8(lhs, rhs); }
inline uint16x8_t vminq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vminq_u16(lhs, rhs); }
inline int16x8_t vminq(const int16x8_t& lhs, const int16x8_t& rhs) { return vminq_s16(lhs, rhs); }
inline uint32x4_t vminq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vminq_u32(lhs, rhs); }
inline int32x4_t vminq(const int32x4_t& lhs, const int32x4_t& rhs) { return vminq_s32(lhs, rhs); }
inline float32x4_t vminq(const float32x4_t& lhs, const float32x4_t& rhs) { return vminq_f32(lhs, rhs); }
////////////////////////////// vmin ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vmin_<type> intrinsic.
inline uint8x8_t vmin(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vmin_u8(lhs, rhs); }
inline int8x8_t vmin(const int8x8_t& lhs, const int8x8_t& rhs) { return vmin_s8(lhs, rhs); }
inline uint16x4_t vmin(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vmin_u16(lhs, rhs); }
inline int16x4_t vmin(const int16x4_t& lhs, const int16x4_t& rhs) { return vmin_s16(lhs, rhs); }
inline uint32x2_t vmin(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vmin_u32(lhs, rhs); }
inline int32x2_t vmin(const int32x2_t& lhs, const int32x2_t& rhs) { return vmin_s32(lhs, rhs); }
inline float32x2_t vmin(const float32x2_t& lhs, const float32x2_t& rhs) { return vmin_f32(lhs, rhs); }
////////////////////////////// vmaxq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vmaxq_<type> intrinsic.
inline uint8x16_t vmaxq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vmaxq_u8(lhs, rhs); }
inline int8x16_t vmaxq(const int8x16_t& lhs, const int8x16_t& rhs) { return vmaxq_s8(lhs, rhs); }
inline uint16x8_t vmaxq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vmaxq_u16(lhs, rhs); }
inline int16x8_t vmaxq(const int16x8_t& lhs, const int16x8_t& rhs) { return vmaxq_s16(lhs, rhs); }
inline uint32x4_t vmaxq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vmaxq_u32(lhs, rhs); }
inline int32x4_t vmaxq(const int32x4_t& lhs, const int32x4_t& rhs) { return vmaxq_s32(lhs, rhs); }
inline float32x4_t vmaxq(const float32x4_t& lhs, const float32x4_t& rhs) { return vmaxq_f32(lhs, rhs); }
////////////////////////////// vmax ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vmax_<type> intrinsic.
inline uint8x8_t vmax(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vmax_u8(lhs, rhs); }
inline int8x8_t vmax(const int8x8_t& lhs, const int8x8_t& rhs) { return vmax_s8(lhs, rhs); }
inline uint16x4_t vmax(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vmax_u16(lhs, rhs); }
inline int16x4_t vmax(const int16x4_t& lhs, const int16x4_t& rhs) { return vmax_s16(lhs, rhs); }
inline uint32x2_t vmax(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vmax_u32(lhs, rhs); }
inline int32x2_t vmax(const int32x2_t& lhs, const int32x2_t& rhs) { return vmax_s32(lhs, rhs); }
inline float32x2_t vmax(const float32x2_t& lhs, const float32x2_t& rhs) { return vmax_f32(lhs, rhs); }
////////////////////////////// vdupq_n ///////////////////////
// Overload set selected by the scalar type; each forwards the scalar to the matching vdupq_n_<type> intrinsic.
inline uint8x16_t vdupq_n(const u8& scalar) { return vdupq_n_u8(scalar); }
inline int8x16_t vdupq_n(const s8& scalar) { return vdupq_n_s8(scalar); }
inline uint16x8_t vdupq_n(const u16& scalar) { return vdupq_n_u16(scalar); }
inline int16x8_t vdupq_n(const s16& scalar) { return vdupq_n_s16(scalar); }
inline uint32x4_t vdupq_n(const u32& scalar) { return vdupq_n_u32(scalar); }
inline int32x4_t vdupq_n(const s32& scalar) { return vdupq_n_s32(scalar); }
inline uint64x2_t vdupq_n(const u64& scalar) { return vdupq_n_u64(scalar); }
inline int64x2_t vdupq_n(const s64& scalar) { return vdupq_n_s64(scalar); }
inline float32x4_t vdupq_n(const f32& scalar) { return vdupq_n_f32(scalar); }
////////////////////////////// vdup_n ///////////////////////
// Overload set selected by the scalar type; each forwards the scalar to the matching vdup_n_<type> intrinsic.
inline uint8x8_t vdup_n(const u8& scalar) { return vdup_n_u8(scalar); }
inline int8x8_t vdup_n(const s8& scalar) { return vdup_n_s8(scalar); }
inline uint16x4_t vdup_n(const u16& scalar) { return vdup_n_u16(scalar); }
inline int16x4_t vdup_n(const s16& scalar) { return vdup_n_s16(scalar); }
inline uint32x2_t vdup_n(const u32& scalar) { return vdup_n_u32(scalar); }
inline int32x2_t vdup_n(const s32& scalar) { return vdup_n_s32(scalar); }
inline uint64x1_t vdup_n(const u64& scalar) { return vdup_n_u64(scalar); }
inline int64x1_t vdup_n(const s64& scalar) { return vdup_n_s64(scalar); }
inline float32x2_t vdup_n(const f32& scalar) { return vdup_n_f32(scalar); }
////////////////////////////// vget_low ///////////////////////
// Overload set selected by the 128-bit vector type; each forwards to the matching vget_low_<type> intrinsic.
inline uint8x8_t vget_low(const uint8x16_t& wide) { return vget_low_u8(wide); }
inline int8x8_t vget_low(const int8x16_t& wide) { return vget_low_s8(wide); }
inline uint16x4_t vget_low(const uint16x8_t& wide) { return vget_low_u16(wide); }
inline int16x4_t vget_low(const int16x8_t& wide) { return vget_low_s16(wide); }
inline uint32x2_t vget_low(const uint32x4_t& wide) { return vget_low_u32(wide); }
inline int32x2_t vget_low(const int32x4_t& wide) { return vget_low_s32(wide); }
inline float32x2_t vget_low(const float32x4_t& wide) { return vget_low_f32(wide); }
////////////////////////////// vget_high ///////////////////////
// Overload set selected by the 128-bit vector type; each forwards to the matching vget_high_<type> intrinsic.
inline uint8x8_t vget_high(const uint8x16_t& wide) { return vget_high_u8(wide); }
inline int8x8_t vget_high(const int8x16_t& wide) { return vget_high_s8(wide); }
inline uint16x4_t vget_high(const uint16x8_t& wide) { return vget_high_u16(wide); }
inline int16x4_t vget_high(const int16x8_t& wide) { return vget_high_s16(wide); }
inline uint32x2_t vget_high(const uint32x4_t& wide) { return vget_high_u32(wide); }
inline int32x2_t vget_high(const int32x4_t& wide) { return vget_high_s32(wide); }
inline float32x2_t vget_high(const float32x4_t& wide) { return vget_high_f32(wide); }
////////////////////////////// vcombine ///////////////////////
// Overload set selected by the 64-bit vector type; each forwards both halves, in order, to the matching vcombine_<type> intrinsic.
inline uint8x16_t vcombine(const uint8x8_t& first, const uint8x8_t& second) { return vcombine_u8(first, second); }
inline int8x16_t vcombine(const int8x8_t& first, const int8x8_t& second) { return vcombine_s8(first, second); }
inline uint16x8_t vcombine(const uint16x4_t& first, const uint16x4_t& second) { return vcombine_u16(first, second); }
inline int16x8_t vcombine(const int16x4_t& first, const int16x4_t& second) { return vcombine_s16(first, second); }
inline uint32x4_t vcombine(const uint32x2_t& first, const uint32x2_t& second) { return vcombine_u32(first, second); }
inline int32x4_t vcombine(const int32x2_t& first, const int32x2_t& second) { return vcombine_s32(first, second); }
inline float32x4_t vcombine(const float32x2_t& first, const float32x2_t& second) { return vcombine_f32(first, second); }
////////////////////////////// vaddq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vaddq_<type> intrinsic.
inline uint8x16_t vaddq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vaddq_u8(lhs, rhs); }
inline int8x16_t vaddq(const int8x16_t& lhs, const int8x16_t& rhs) { return vaddq_s8(lhs, rhs); }
inline uint16x8_t vaddq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vaddq_u16(lhs, rhs); }
inline int16x8_t vaddq(const int16x8_t& lhs, const int16x8_t& rhs) { return vaddq_s16(lhs, rhs); }
inline uint32x4_t vaddq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vaddq_u32(lhs, rhs); }
inline int32x4_t vaddq(const int32x4_t& lhs, const int32x4_t& rhs) { return vaddq_s32(lhs, rhs); }
inline float32x4_t vaddq(const float32x4_t& lhs, const float32x4_t& rhs) { return vaddq_f32(lhs, rhs); }
////////////////////////////// vadd ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vadd_<type> intrinsic.
inline uint8x8_t vadd(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vadd_u8(lhs, rhs); }
inline int8x8_t vadd(const int8x8_t& lhs, const int8x8_t& rhs) { return vadd_s8(lhs, rhs); }
inline uint16x4_t vadd(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vadd_u16(lhs, rhs); }
inline int16x4_t vadd(const int16x4_t& lhs, const int16x4_t& rhs) { return vadd_s16(lhs, rhs); }
inline uint32x2_t vadd(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vadd_u32(lhs, rhs); }
inline int32x2_t vadd(const int32x2_t& lhs, const int32x2_t& rhs) { return vadd_s32(lhs, rhs); }
inline float32x2_t vadd(const float32x2_t& lhs, const float32x2_t& rhs) { return vadd_f32(lhs, rhs); }
////////////////////////////// vqaddq ///////////////////////
inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); }
inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); }
inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); }
inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); }
inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); }
inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); }
////////////////////////////// vqadd ///////////////////////
inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); }
inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); }
inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); }
inline int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); }
inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); }
inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); }
////////////////////////////// vsubq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vsubq_<type> intrinsic.
inline uint8x16_t vsubq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vsubq_u8(lhs, rhs); }
inline int8x16_t vsubq(const int8x16_t& lhs, const int8x16_t& rhs) { return vsubq_s8(lhs, rhs); }
inline uint16x8_t vsubq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vsubq_u16(lhs, rhs); }
inline int16x8_t vsubq(const int16x8_t& lhs, const int16x8_t& rhs) { return vsubq_s16(lhs, rhs); }
inline uint32x4_t vsubq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vsubq_u32(lhs, rhs); }
inline int32x4_t vsubq(const int32x4_t& lhs, const int32x4_t& rhs) { return vsubq_s32(lhs, rhs); }
inline float32x4_t vsubq(const float32x4_t& lhs, const float32x4_t& rhs) { return vsubq_f32(lhs, rhs); }
////////////////////////////// vsub ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vsub_<type> intrinsic.
inline uint8x8_t vsub(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vsub_u8(lhs, rhs); }
inline int8x8_t vsub(const int8x8_t& lhs, const int8x8_t& rhs) { return vsub_s8(lhs, rhs); }
inline uint16x4_t vsub(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vsub_u16(lhs, rhs); }
inline int16x4_t vsub(const int16x4_t& lhs, const int16x4_t& rhs) { return vsub_s16(lhs, rhs); }
inline uint32x2_t vsub(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vsub_u32(lhs, rhs); }
inline int32x2_t vsub(const int32x2_t& lhs, const int32x2_t& rhs) { return vsub_s32(lhs, rhs); }
inline float32x2_t vsub(const float32x2_t& lhs, const float32x2_t& rhs) { return vsub_f32(lhs, rhs); }
////////////////////////////// vqsubq ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vqsubq_<type> intrinsic.
inline uint8x16_t vqsubq(const uint8x16_t& lhs, const uint8x16_t& rhs) { return vqsubq_u8(lhs, rhs); }
inline int8x16_t vqsubq(const int8x16_t& lhs, const int8x16_t& rhs) { return vqsubq_s8(lhs, rhs); }
inline uint16x8_t vqsubq(const uint16x8_t& lhs, const uint16x8_t& rhs) { return vqsubq_u16(lhs, rhs); }
inline int16x8_t vqsubq(const int16x8_t& lhs, const int16x8_t& rhs) { return vqsubq_s16(lhs, rhs); }
inline uint32x4_t vqsubq(const uint32x4_t& lhs, const uint32x4_t& rhs) { return vqsubq_u32(lhs, rhs); }
inline int32x4_t vqsubq(const int32x4_t& lhs, const int32x4_t& rhs) { return vqsubq_s32(lhs, rhs); }
inline uint64x2_t vqsubq(const uint64x2_t& lhs, const uint64x2_t& rhs) { return vqsubq_u64(lhs, rhs); }
inline int64x2_t vqsubq(const int64x2_t& lhs, const int64x2_t& rhs) { return vqsubq_s64(lhs, rhs); }
////////////////////////////// vqsub ///////////////////////
// Overload set selected by the vector type; each forwards both operands to the matching vqsub_<type> intrinsic.
inline uint8x8_t vqsub(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vqsub_u8(lhs, rhs); }
inline int8x8_t vqsub(const int8x8_t& lhs, const int8x8_t& rhs) { return vqsub_s8(lhs, rhs); }
inline uint16x4_t vqsub(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vqsub_u16(lhs, rhs); }
inline int16x4_t vqsub(const int16x4_t& lhs, const int16x4_t& rhs) { return vqsub_s16(lhs, rhs); }
inline uint32x2_t vqsub(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vqsub_u32(lhs, rhs); }
inline int32x2_t vqsub(const int32x2_t& lhs, const int32x2_t& rhs) { return vqsub_s32(lhs, rhs); }
inline uint64x1_t vqsub(const uint64x1_t& lhs, const uint64x1_t& rhs) { return vqsub_u64(lhs, rhs); }
inline int64x1_t vqsub(const int64x1_t& lhs, const int64x1_t& rhs) { return vqsub_s64(lhs, rhs); }
////////////////////////////// vmull ///////////////////////
// Overload set selected by the 64-bit operand type; each forwards both operands to the matching
// vmull_<type> intrinsic and returns a 128-bit vector whose elements are twice as wide as the inputs'.
inline uint16x8_t vmull(const uint8x8_t& lhs, const uint8x8_t& rhs) { return vmull_u8(lhs, rhs); }
inline int16x8_t vmull(const int8x8_t& lhs, const int8x8_t& rhs) { return vmull_s8(lhs, rhs); }
inline uint32x4_t vmull(const uint16x4_t& lhs, const uint16x4_t& rhs) { return vmull_u16(lhs, rhs); }
inline int32x4_t vmull(const int16x4_t& lhs, const int16x4_t& rhs) { return vmull_s16(lhs, rhs); }
inline uint64x2_t vmull(const uint32x2_t& lhs, const uint32x2_t& rhs) { return vmull_u32(lhs, rhs); }
inline int64x2_t vmull(const int32x2_t& lhs, const int32x2_t& rhs) { return vmull_s32(lhs, rhs); }
////////////////////////////// vrev64q ///////////////////////
// Overload set selected by the vector type; each forwards to the matching vrev64q_<type> intrinsic.
inline uint8x16_t vrev64q(const uint8x16_t& vec) { return vrev64q_u8(vec); }
inline int8x16_t vrev64q(const int8x16_t& vec) { return vrev64q_s8(vec); }
inline uint16x8_t vrev64q(const uint16x8_t& vec) { return vrev64q_u16(vec); }
inline int16x8_t vrev64q(const int16x8_t& vec) { return vrev64q_s16(vec); }
inline uint32x4_t vrev64q(const uint32x4_t& vec) { return vrev64q_u32(vec); }
inline int32x4_t vrev64q(const int32x4_t& vec) { return vrev64q_s32(vec); }
inline float32x4_t vrev64q(const float32x4_t& vec) { return vrev64q_f32(vec); }
////////////////////////////// vrev64 ///////////////////////
// Overload set selected by the vector type; each forwards to the matching vrev64_<type> intrinsic.
inline uint8x8_t vrev64(const uint8x8_t& vec) { return vrev64_u8(vec); }
inline int8x8_t vrev64(const int8x8_t& vec) { return vrev64_s8(vec); }
inline uint16x4_t vrev64(const uint16x4_t& vec) { return vrev64_u16(vec); }
inline int16x4_t vrev64(const int16x4_t& vec) { return vrev64_s16(vec); }
inline uint32x2_t vrev64(const uint32x2_t& vec) { return vrev64_u32(vec); }
inline int32x2_t vrev64(const int32x2_t& vec) { return vrev64_s32(vec); }
inline float32x2_t vrev64(const float32x2_t& vec) { return vrev64_f32(vec); }
////////////////////////////// vceqq ///////////////////////
// Lane-wise "equal" comparison of two 128-bit vectors. Whatever the operand
// type (unsigned, signed or f32), the result is the unsigned vector with the
// same lane width, i.e. the per-lane mask produced by the vceqq_* intrinsic.
inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); }
inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); }
inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); }
inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); }
inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); }
inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); }
inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }
////////////////////////////// vceq ///////////////////////
// Lane-wise "equal" comparison of two 64-bit vectors; returns the unsigned
// mask vector of matching lane width (see the vceq_* intrinsics).
inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); }
inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); }
inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); }
inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); }
inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); }
inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); }
inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); }
////////////////////////////// vcgtq ///////////////////////
// Lane-wise "v0 > v1" comparison of two 128-bit vectors; returns the unsigned
// mask vector of matching lane width (see the vcgtq_* intrinsics).
inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); }
inline uint8x16_t vcgtq(const int8x16_t & v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); }
inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); }
inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); }
inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); }
inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); }
inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); }
////////////////////////////// vcgt ///////////////////////
// Lane-wise "v0 > v1" comparison of two 64-bit vectors; returns the unsigned
// mask vector of matching lane width (see the vcgt_* intrinsics).
inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); }
inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); }
inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); }
inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); }
inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); }
inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); }
inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); }
////////////////////////////// vcgeq ///////////////////////
// Lane-wise "v0 >= v1" comparison of two 128-bit vectors; returns the unsigned
// mask vector of matching lane width (see the vcgeq_* intrinsics).
inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); }
inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); }
inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); }
inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); }
inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); }
inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); }
inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); }
////////////////////////////// vcge ///////////////////////
// Lane-wise "v0 >= v1" comparison of two 64-bit vectors; returns the unsigned
// mask vector of matching lane width (see the vcge_* intrinsics).
inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); }
inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); }
inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); }
inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); }
inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); }
inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); }
inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); }
////////////////////////////// vandq ///////////////////////
inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); }
inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); }
inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); }
inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); }
inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); }
inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); }
////////////////////////////// vand ///////////////////////
inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); }
inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); }
inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); }
inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); }
inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); }
inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); }
////////////////////////////// vmovn ///////////////////////
// Narrowing move: takes a 128-bit vector and returns a 64-bit vector with the
// same lane count and lanes half as wide (16->8, 32->16, 64->32 bits).
// Forwards to the non-saturating vmovn_* intrinsic for the argument type.
inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); }
inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); }
inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); }
inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); }
inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); }
inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); }
////////////////////////////// vqmovn ///////////////////////
// Saturating narrowing move: same shapes as vmovn (128-bit in, 64-bit out,
// lanes half as wide) but forwards to the saturating vqmovn_* intrinsic.
inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); }
inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); }
inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); }
inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); }
inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); }
inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); }
////////////////////////////// vmovl ///////////////////////
inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); }
inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); }
inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); }
inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); }
////////////////////////////// vmvnq ///////////////////////
// Bitwise NOT of a 128-bit integer vector, dispatched on the argument type to
// the matching vmvnq_* intrinsic (8/16/32-bit lanes).
inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); }
inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); }
inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); }
inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); }
inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); }
inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); }
////////////////////////////// vmvn ///////////////////////
// Bitwise NOT of a 64-bit integer vector, dispatched on the argument type to
// the matching vmvn_* intrinsic (8/16/32-bit lanes).
inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); }
inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); }
inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); }
inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); }
inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); }
inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); }
////////////////////////////// vbicq ///////////////////////
// Bit clear on 128-bit integer vectors: forwards to the vbicq_* intrinsic for
// the argument type, which computes v0 AND (NOT v1). Covers 8/16/32/64-bit
// lanes, signed and unsigned.
inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); }
inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); }
inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); }
inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); }
inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); }
inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); }
inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
inline int64x2_t vbicq(const int64x2_t & v0, const int64x2_t & v1) { return vbicq_s64(v0, v1); }
////////////////////////////// vbic ///////////////////////
// Bit clear on 64-bit integer vectors: forwards to the vbic_* intrinsic for
// the argument type, which computes v0 AND (NOT v1). Covers 8/16/32/64-bit
// lanes, signed and unsigned.
inline uint8x8_t vbic(const uint8x8_t & v0, const uint8x8_t & v1) { return vbic_u8 (v0, v1); }
inline int8x8_t vbic(const int8x8_t & v0, const int8x8_t & v1) { return vbic_s8 (v0, v1); }
inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
inline int16x4_t vbic(const int16x4_t & v0, const int16x4_t & v1) { return vbic_s16(v0, v1); }
inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
inline int32x2_t vbic(const int32x2_t & v0, const int32x2_t & v1) { return vbic_s32(v0, v1); }
inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
inline int64x1_t vbic(const int64x1_t & v0, const int64x1_t & v1) { return vbic_s64(v0, v1); }
////////////////////////////// vtransform ///////////////////////
// Runs a binary element-wise functor over two source images and stores the
// result in a destination image.
//
// Op must expose a nested `type` (the element type shared by all three images)
// and be callable as op(a, b, r) in three flavours: with
// VecTraits<type>::vec128 values, with VecTraits<type>::vec64 values (r being
// the output vector in both cases), and with element pointers (two inputs, one
// output) for the leftover elements of a row.
//
// When all three strides equal width * sizeof(type) the rows are contiguous and
// the whole image is handled as a single long row. Every row is then consumed
// in three passes: 32 bytes per iteration, 8 bytes per iteration, and finally
// one element at a time.
template <typename Op>
void vtransform(Size2D size,
                const typename Op::type * src0Base, ptrdiff_t src0Stride,
                const typename Op::type * src1Base, ptrdiff_t src1Stride,
                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type elem_t;
    typedef typename VecTraits<elem_t>::vec128 qvec_t;
    typedef typename VecTraits<elem_t>::vec64 dvec_t;

    // Collapse a densely packed image into one row.
    const bool equalStrides = src0Stride == src1Stride && src0Stride == dstStride;
    if (equalStrides && src0Stride == (ptrdiff_t)(size.width * sizeof(elem_t)))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Elements per 128-bit vector, per wide iteration (two 128-bit vectors)
    // and per narrow iteration (one 64-bit vector).
    const size_t qLanes = 16 / sizeof(elem_t);
    const size_t wideStep = 32 / sizeof(elem_t);
    const size_t narrowStep = 8 / sizeof(elem_t);

    // Exclusive upper bounds on the start index of a full wide / narrow step.
    size_t wideEnd = size.width >= (wideStep - 1) ? size.width - wideStep + 1 : 0;
    size_t narrowEnd = size.width >= (narrowStep - 1) ? size.width - narrowStep + 1 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const elem_t * in0 = internal::getRowPtr(src0Base, src0Stride, row);
        const elem_t * in1 = internal::getRowPtr(src1Base, src1Stride, row);
        elem_t * out = internal::getRowPtr(dstBase, dstStride, row);

        size_t col = 0;

        // Wide pass: all four loads are issued before the first store.
        while (col < wideEnd)
        {
            internal::prefetch(in0 + col);
            internal::prefetch(in1 + col);

            qvec_t lhsLo = vld1q(in0 + col), lhsHi = vld1q(in0 + col + qLanes);
            qvec_t rhsLo = vld1q(in1 + col), rhsHi = vld1q(in1 + col + qLanes);
            qvec_t result;

            op(lhsLo, rhsLo, result);
            vst1q(out + col, result);

            op(lhsHi, rhsHi, result);
            vst1q(out + col + qLanes, result);

            col += wideStep;
        }

        // Narrow pass: one 64-bit vector per iteration.
        while (col < narrowEnd)
        {
            dvec_t lhs = vld1(in0 + col);
            dvec_t rhs = vld1(in1 + col);
            dvec_t result;

            op(lhs, rhs, result);
            vst1(out + col, result);

            col += narrowStep;
        }

        // Scalar pass for whatever is left of the row.
        while (col < size.width)
        {
            op(in0 + col, in1 + col, out + col);
            ++col;
        }
    }
}
} }
#endif // CAROTENE_NEON
#endif

434
3rdparty/carotene/src/warp_affine.cpp vendored Normal file
View File

@ -0,0 +1,434 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpAffineLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
/*
 * Affine warp of a u8 image with nearest-neighbour sampling.
 *
 * Every destination pixel (x, y) is fetched from the source position
 *     src_x = m[0] * x + m[2] * y + m[4]
 *     src_y = m[1] * x + m[3] * y + m[5]
 * i.e. m holds six coefficients that map destination coordinates to source
 * coordinates.
 *
 * The destination is walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels.
 * For each tile a table of s32 source offsets (src_y * srcStride + src_x) is
 * built - four pixels per iteration with NEON, the rest of each row with scalar
 * code - and the table is then passed to remapNearestNeighborReplicate() or
 * remapNearestNeighborConst() together with the tile's destination pointer.
 *
 * ssize        source image size, checked by isWarpAffineNearestNeighborSupported()
 * dsize        destination image size
 * srcBase      source pixels
 * srcStride    source row pitch; multiplies src_y when an offset is formed
 * m            the six transform coefficients described above
 * dstBase      destination pixels
 * dstStride    destination row pitch
 * borderMode   BORDER_MODE_REPLICATE clamps the source coordinates into the
 *              image; BORDER_MODE_CONSTANT stores -1 in the table for positions
 *              outside the image. For any other mode no pixel is written.
 * borderValue  forwarded to remapNearestNeighborConst() (constant mode only)
 *
 * When CAROTENE_NEON is not defined only the support assertion is executed.
 */
void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Offset table for one tile; the 16 spare entries leave room for alignPtr()
// to move the start of the table forward.
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
// Largest valid source coordinates and the source pitch, broadcast to all lanes.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
// Destination x advances by four per vector iteration.
float32x4_t v_4 = vdupq_n_f32(4.0f);
// Transform coefficients, broadcast to all lanes.
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
// (i, j) is the top-left destination pixel of the current tile.
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// Build the offset table of this tile, one tile row at a time.
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
// x is tile-relative, y_ is the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x of the first four pixels of the tile row.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// Convert to integer coordinates and clamp them to [0, width - 1] x [0, height - 1].
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
// offset = src_x + src_y * srcStride
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) pixels of the tile row.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// Gather the source pixels of the tile through the offset table.
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// Table value written for positions that fall outside the source image.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// (i, j) is the top-left destination pixel of the current tile.
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// Build the offset table of this tile, one tile row at a time.
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
// x is tile-relative, y_ is the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x of the first four pixels of the tile row.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
// A lane is inside the image when its float coordinates are >= 0 and its
// integer coordinates do not exceed width - 1 / height - 1.
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
// Inside lanes keep src_x + src_y * srcStride, the others become -1.
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) pixels of the tile row.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// Gather the tile through the offset table; borderValue is handed to the
// helper together with the -1 markers.
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// No NEON: silence unused-parameter warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
/*
 * Affine warp of a u8 image with bilinear sampling.
 *
 * Every destination pixel (x, y) is taken from the source position
 *     src_x = m[0] * x + m[2] * y + m[4]
 *     src_y = m[1] * x + m[3] * y + m[5]
 * i.e. m holds six coefficients that map destination coordinates to source
 * coordinates.
 *
 * The destination is walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels.
 * For each tile pixel two tables are filled:
 *   - map:    four s32 source offsets, in the order (x0, y0), (x1, y0),
 *             (x0, y1), (x1, y1), where (x0, y0) is the floored source
 *             position and x1 = x0 + 1, y1 = y0 + 1;
 *   - coeffs: two f32 values, the fractional parts of src_x and src_y.
 * Four pixels per iteration are produced with NEON, the rest of each row with
 * scalar code. The tables are then passed to remapLinearReplicate() or
 * remapLinearConst() together with the tile's destination pointer.
 *
 * ssize        source image size, checked by isWarpAffineLinearSupported()
 * dsize        destination image size
 * srcBase      source pixels
 * srcStride    source row pitch; multiplies the row coordinate in each offset
 * m            the six transform coefficients described above
 * dstBase      destination pixels
 * dstStride    destination row pitch
 * borderMode   BORDER_MODE_REPLICATE clamps all four neighbour coordinates into
 *              the image; BORDER_MODE_CONSTANT stores -1 for every neighbour
 *              that lies outside the image. For any other mode no pixel is
 *              written.
 * borderValue  forwarded to remapLinearConst() (constant mode only)
 *
 * When CAROTENE_NEON is not defined only the support assertion is executed.
 */
void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Per-tile tables: four offsets and two weights per pixel. The 16 spare
// entries leave room for alignPtr() to move the start of each table forward.
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
// Largest valid source coordinates, source pitch and the constant 1, broadcast to all lanes.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
// Transform coefficients, broadcast to all lanes.
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
// (i, j) is the top-left destination pixel of the current tile.
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// Build the offset and weight tables of this tile, one tile row at a time.
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
// x is tile-relative, y_ is the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x of the first four pixels of the tile row.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
// Fractional parts: position minus converted integer coordinate.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
// Where the fraction came out negative, add 1 to the fraction and subtract 1
// from the integer coordinate, which turns the conversion into a floor.
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
// Clamp the base coordinates and their +1 neighbours into the image.
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
// Offsets of the four neighbours: (x0,y0), (x1,y0), (x0,y1), (x1,y1).
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
// Interleaving stores: 2 weights and 4 offsets per pixel, the same layout
// the scalar tail below writes element by element.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) pixels of the tile row.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
// Weights are taken from the unclamped floor.
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// The +1 neighbours are derived before the base coordinates are clamped.
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
// Produce the destination tile from the offset and weight tables.
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// Table value written for neighbours that fall outside the source image.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
// (i, j) is the top-left destination pixel of the current tile.
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// Build the offset and weight tables of this tile, one tile row at a time.
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
// x is tile-relative, y_ is the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x of the first four pixels of the tile row.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
// Fractional parts: position minus converted integer coordinate.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
// Where the fraction came out negative, add 1 to the fraction and subtract 1
// from the integer coordinate, which turns the conversion into a floor.
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
// Unclamped offsets of the four neighbours: (x0,y0), (x1,y0), (x0,y1), (x1,y1).
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
// Per-coordinate validity: non-negative (tested on the float position, shifted
// by one for the +1 neighbour) and not beyond width - 1 / height - 1.
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
// A neighbour keeps its offset only when both of its coordinates are valid;
// otherwise it is replaced by -1.
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
// Interleaving stores: 2 weights and 4 offsets per pixel, the same layout
// the scalar tail below writes element by element.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) pixels of the tile row.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// Each neighbour is stored as an offset when inside the image, as -1 otherwise.
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
// Produce the destination tile from the tables; borderValue is handed to the
// helper together with the -1 markers.
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// No NEON: silence unused-parameter warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,464 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
/*
 * Perspective warp with bilinear sampling (one u8 element per pixel).
 *
 * For every destination pixel (x, y) the source position is computed as
 *     w     = 1 / (m[2] * x + m[5] * y + m[8])
 *     src_x = (m[0] * x + m[3] * y + m[6]) * w
 *     src_y = (m[1] * x + m[4] * y + m[7]) * w
 * i.e. m is read as 9 floats with the x-terms in m[0..2], the y-terms in
 * m[3..5] and the constant terms in m[6..8].
 *
 * The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE.
 * For each tile two tables are filled and handed to the remap helper:
 *   - map:    4 source offsets per pixel, in the order
 *             (x0,y0), (x1,y0), (x0,y1), (x1,y1), each y * srcStride + x
 *   - coeffs: 2 floats per pixel, the fractional parts of src_x and src_y
 *
 * borderMode:
 *   BORDER_MODE_REPLICATE - neighbour coordinates are clamped to
 *                           [0, width-1] x [0, height-1]; remapLinearReplicate
 *                           produces the output.
 *   BORDER_MODE_CONSTANT  - neighbours outside the source get offset -1;
 *                           remapLinearConst produces the output and receives
 *                           borderValue.
 *   Any other mode        - nothing is written to dstBase.
 *
 * When CAROTENE_NEON is not defined the function does nothing beyond the
 * assertSupportedConfiguration() call.
 */
void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           const f32 * m,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile lookup tables; over-allocated by 16 elements so the start can be aligned.
    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

    float32x4_t v_4 = vdupq_n_f32(4.0f);

    float32x4_t v_m0 = vdupq_n_f32(m[0]);
    float32x4_t v_m1 = vdupq_n_f32(m[1]);
    float32x4_t v_m2 = vdupq_n_f32(m[2]);
    float32x4_t v_m3 = vdupq_n_f32(m[3]);
    float32x4_t v_m4 = vdupq_n_f32(m[4]);
    float32x4_t v_m5 = vdupq_n_f32(m[5]);
    float32x4_t v_m6 = vdupq_n_f32(m[6]);
    float32x4_t v_m7 = vdupq_n_f32(m[7]);
    float32x4_t v_m8 = vdupq_n_f32(m[8]);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indices[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indices), v_y = vdupq_n_f32(y_);
                    // Row-constant parts of the three projective sums.
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        // vcvtq truncates toward zero; the fix-up below turns it into floor().
                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        // Saturating -1 / +1: the conversion above saturates for huge
                        // coordinates (denominator close to zero), and a wrapping
                        // add/sub would flip the sign and clamp to the wrong edge.
                        v_src_x = vbslq_s32(v_maskx, vqsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vqsubq_s32(v_src_y, v_1), v_src_y);

                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vqaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vqaddq_s32(v_1, v_src_y)));

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        // Interleaved stores: 2 coefficients and 4 offsets per pixel.
                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) columns of the tile row.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                // make remap
                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indices[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indices), v_y = vdupq_n_f32(y_);
                    // Row-constant parts of the three projective sums.
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        // vcvtq truncates toward zero; the fix-up below turns it into floor().
                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vqsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vqsubq_s32(v_src_y0, v_1), v_src_y0);

                        // Saturating +1: if the conversion saturated to INT_MAX, a wrapping
                        // add would yield INT_MIN, which passes the "<= width-1" /
                        // "<= height-1" tests below and lets a garbage offset through
                        // instead of -1.
                        int32x4_t v_src_x1 = vqaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vqaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        // Lower bound is tested on the float value, upper bound on the integer.
                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4f), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4f), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4f), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4f), vcleq_s32(v_src_y1, v_height4));

                        // Out-of-image neighbours are replaced by the -1 marker.
                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        // Interleaved stores: 2 coefficients and 4 offsets per pixel.
                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) columns of the tile row.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                // make remap
                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)m;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS