Merge pull request #2 from opencv/master

update to master
This commit is contained in:
Eric Sommerlade 2016-10-25 18:47:12 +01:00 committed by GitHub
commit 5ebdf6cedd
2328 changed files with 654012 additions and 131476 deletions

30
.github/ISSUE_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,30 @@
<!--
If you have a question rather than a bug report, please go to http://answers.opencv.org, where you will get a much faster response.
If you need further assistance please read [How To Contribute](https://github.com/opencv/opencv/wiki/How_to_contribute).
This is a template helping you to create an issue which can be processed as quickly as possible. This is the bug reporting section for the OpenCV library.
-->
##### System information (version)
<!-- Example
- OpenCV => 3.1
- Operating System / Platform => Windows 64 Bit
- Compiler => Visual Studio 2015
-->
- OpenCV => :grey_question:
- Operating System / Platform => :grey_question:
- Compiler => :grey_question:
##### Detailed description
<!-- your description -->
##### Steps to reproduce
<!-- to add a code example, fence it with triple backticks and an optional file extension
```.cpp
// C++ code example
```
or attach as .txt or .zip file
-->

9
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,9 @@
<!-- Please use this line to close one or multiple issues when this pull request gets merged
You can add another line right under the first one:
resolves #1234
resolves #1235
-->
### This pull request changes
<!-- Please describe what your pull request is changing -->

15
.gitignore vendored
View File

@ -8,3 +8,18 @@
Thumbs.db
tags
tegra/
bin/
*.sdf
*.opensdf
*.obj
*.stamp
*.depend
*.rule
*.tmp
*/debug
*/CMakeFiles
CMakeCache.txt
*.suo
*.log
*.tlog
build

8
3rdparty/carotene/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Gedit temp files
*~
# Qt Creator file
*.user
# MacOS-specific (Desktop Services Store)
.DS_Store

42
3rdparty/carotene/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,42 @@
# Build script for the Carotene library.
#
# Produces two targets:
#   carotene_objs - OBJECT library holding the compiled sources
#   carotene      - STATIC library assembled from those objects (not part of "all")
cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
project(Carotene)

# C++ namespace that wraps every Carotene definition; overridable from the cache.
set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")

set(CAROTENE_INCLUDE_DIR include)
set(CAROTENE_SOURCE_DIR src)

# Collect public headers and implementation files, relative to this directory.
file(GLOB_RECURSE carotene_headers
     RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
     "${CAROTENE_INCLUDE_DIR}/*.hpp")
file(GLOB_RECURSE carotene_sources
     RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
     "${CAROTENE_SOURCE_DIR}/*.cpp"
     "${CAROTENE_SOURCE_DIR}/*.hpp")

include_directories(${CAROTENE_INCLUDE_DIR})

if(CMAKE_COMPILER_IS_GNUCC)
  # Hide symbols by default; the flag is prepended so later flags can override it.
  set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")

  # allow more inlines - these parameters improve performance for:
  # - matchTemplate about 5-10%
  # - goodFeaturesToTrack 10-20%
  # - cornerHarris 30% for some cases
  set_source_files_properties(
    ${carotene_sources}
    COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()

add_library(carotene_objs OBJECT
  ${carotene_headers}
  ${carotene_sources}
)

# Only export the namespace definition when it differs from the header default.
if(NOT CAROTENE_NS STREQUAL "carotene")
  target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
endif()

# WITH_NEON is expected to be provided by the including project.
if(WITH_NEON)
  target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
endif()

set_property(TARGET carotene_objs PROPERTY POSITION_INDEPENDENT_CODE TRUE)

add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")

2
3rdparty/carotene/README.md vendored Normal file
View File

@ -0,0 +1,2 @@
This is Carotene, a low-level library containing optimized CPU routines
that are useful for computer vision algorithms.

114
3rdparty/carotene/hal/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,114 @@
# Configuration prologue for the Carotene-based HAL: locates Carotene, classifies
# the target CPU and assembles the extra compiler flags used for this directory.
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)

include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")

# Classify the target processor; at most one of ARM / AARCH64 is set here.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
  set(ARM TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
  set(AARCH64 TRUE)
endif()

# Flags are gathered as a list and flattened to a space-separated string below.
set(TEGRA_COMPILER_FLAGS "")

if(CMAKE_COMPILER_IS_GNUCXX)
  # Generate unwind information even for functions that can't throw/propagate exceptions.
  # This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
  list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)

  # Scheduling / register-allocation tuning, chosen per platform and compiler.
  if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
    list(APPEND TEGRA_COMPILER_FLAGS
         -fweb
         -fwrapv
         -frename-registers
         -fsched-stalled-insns-dep=100
         -fsched-stalled-insns=2)
  elseif(CMAKE_COMPILER_IS_CLANGCXX)
    list(APPEND TEGRA_COMPILER_FLAGS -fwrapv)
  else()
    list(APPEND TEGRA_COMPILER_FLAGS
         -fweb
         -fwrapv
         -frename-registers
         -fsched2-use-superblocks
         -fsched2-use-traces
         -fsched-stalled-insns-dep=100
         -fsched-stalled-insns=2)
  endif()

  # Graphite loop optimizations, only for sufficiently recent Android toolchains.
  if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d")
    list(APPEND TEGRA_COMPILER_FLAGS
         -fgraphite
         -fgraphite-identity
         -floop-block
         -floop-flatten
         -floop-interchange
         -floop-strip-mine
         -floop-parallelize-all
         -ftree-loop-linear)
  endif()
endif()

# Flatten the list and append it to both the C and the C++ flags.
string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")

# Disable the auto-vectorizer for GCC builds targeting armeabi-v7a.
if(ARMEABI_V7A AND CMAKE_COMPILER_IS_GNUCXX)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize")
endif()

if(WITH_LOGS)
  add_definitions(-DHAVE_LOGS)
endif()

# The HAL always builds Carotene inside its own namespace, overriding any cached value.
set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
# compile_carotene()
#
# Adds the Carotene directory to the build (creating the `carotene_objs`
# target) and, on ARM/AArch64 targets whose C and C++ compilers both accept
# it, appends `-mfpu=neon` to the COMPILE_FLAGS of `carotene_objs`.
#
# Reads:  ENABLE_NEON, CAROTENE_DIR, ARM, AARCH64, CMAKE_BUILD_TYPE
# Takes no arguments. WITH_NEON is set only inside this function's scope,
# which the added subdirectory inherits.
function(compile_carotene)
  if(ENABLE_NEON)
    set(WITH_NEON ON)
  endif()

  add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")

  if(ARM OR AARCH64)
    # Make the flag probes use the same configuration as the real build.
    if(CMAKE_BUILD_TYPE)
      set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
    endif()

    check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
    check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)

    # Test the result variables by name: a failed check leaves the variable
    # empty, so expanding it with ${...} would drop the operand and leave a
    # malformed condition (for example `if(1 AND)`).
    if(CXX_HAS_MFPU_NEON AND C_HAS_MFPU_NEON)
      # Preserve any flags already attached to the target.
      get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
      if(old_flags)
        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
      else()
        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
      endif()
    endif()
  endif()
endfunction()
# Build Carotene, then wrap its objects into the `tegra_hal` static library and
# publish the HAL description variables to the parent scope.
compile_carotene()

include_directories("${CAROTENE_DIR}/include")

# Re-use Carotene's public compile definitions for everything in this directory.
# get_target_property() yields "<var>-NOTFOUND" when the property is unset; skip
# the append in that case so no bogus definition reaches the command line.
get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
if(carotene_defs)
  set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
endif()

if(CMAKE_COMPILER_IS_GNUCXX)
  # allow more inlines - these parameters improve performance for:
  # matchTemplate about 5-10%
  # goodFeaturesToTrack 10-20%
  # cornerHarris 30% for some cases
  # NOTE(review): the second file argument is a generator expression; source
  # file names given to set_source_files_properties() do not appear to be
  # evaluated as generator expressions, so this likely affects impl.cpp only - confirm.
  set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()

add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})

set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")

# The archive only needs installing when OpenCV itself is built statically.
if(NOT BUILD_SHARED_LIBS)
  ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()

target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)

# HAL description handed back to the including CMakeLists.
set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)

# Copy the headers into <build>/carotene so they resolve through CAROTENE_HAL_INCLUDE_DIRS.
configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)

1851
3rdparty/carotene/hal/tegra_hal.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,47 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
// Include guard for the Carotene namespace configuration header.
#ifndef CAROTENE_DEFINITIONS_HPP
#define CAROTENE_DEFINITIONS_HPP
// CAROTENE_NS names the namespace that wraps the library. It falls back to
// `carotene` unless the build has already defined it (e.g. via a -D flag).
#ifndef CAROTENE_NS
#define CAROTENE_NS carotene
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,125 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_TYPES_HPP
#define CAROTENE_TYPES_HPP
#include <carotene/definitions.hpp>
#include <stdint.h>
#include <cstddef>
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
namespace CAROTENE_NS {
using std::size_t;
using std::ptrdiff_t;
typedef int8_t s8;
typedef uint8_t u8;
typedef int16_t s16;
typedef uint16_t u16;
typedef int32_t s32;
typedef uint32_t u32;
typedef float f32;
typedef int64_t s64;
typedef uint64_t u64;
typedef double f64;
typedef ptrdiff_t stride_t;
// Selects how out-of-range results are converted.
// NOTE(review): presumably WRAP = modular arithmetic, SATURATE = clamp to the type range - confirm against the functions that take it.
enum CONVERT_POLICY
{
CONVERT_POLICY_WRAP,
CONVERT_POLICY_SATURATE
};
// Selects how pixels outside the image are treated.
// NOTE(review): the exact extrapolation rule of each mode is defined by the routines that consume it, not here.
enum BORDER_MODE
{
BORDER_MODE_UNDEFINED,
BORDER_MODE_CONSTANT,
BORDER_MODE_REPLICATE,
BORDER_MODE_REFLECT,
BORDER_MODE_REFLECT101,
BORDER_MODE_WRAP
};
// Flip direction flags; the values are bit flags so that FLIP_BOTH_MODE is
// the bitwise OR of the horizontal and vertical modes.
enum FLIP_MODE
{
FLIP_HORIZONTAL_MODE = 1,
FLIP_VERTICAL_MODE = 2,
FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
};
// Color space selector (BT.601 or BT.709, going by the enumerator names).
enum COLOR_SPACE
{
COLOR_SPACE_BT601,
COLOR_SPACE_BT709
};
// Two-dimensional extent described by a width and a height.
struct Size2D
{
    size_t width;
    size_t height;

    // Creates an empty extent (0 x 0).
    Size2D()
        : width(0)
        , height(0)
    {
    }

    // Creates an extent of w x h.
    Size2D(size_t w, size_t h)
        : width(w)
        , height(h)
    {
    }

    // Number of elements covered by the extent (width * height).
    size_t total() const
    {
        return width * height;
    }
};
// Four-sided margin around a rectangular region.
struct Margin
{
    // these are measured in elements
    size_t left;
    size_t right;
    size_t top;
    size_t bottom;

    // Creates a margin of zero on every side.
    Margin()
        : left(0)
        , right(0)
        , top(0)
        , bottom(0)
    {
    }

    // Creates a margin with an explicit size for each side.
    Margin(size_t l, size_t r, size_t t, size_t b)
        : left(l)
        , right(r)
        , top(t)
        , bottom(b)
    {
    }
};
// Abstract interface that receives keypoints one at a time through push().
struct KeypointStore {
    // Hands one keypoint to the store. Position and size are mandatory; the
    // remaining attributes default to kpAngle = -1, kpResponse = 0,
    // kpOctave = 0 and kpClass_id = -1.
    virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;
    // Virtual so implementations can be destroyed through a KeypointStore pointer.
    virtual ~KeypointStore() {}
};
}
#endif

241
3rdparty/carotene/src/absdiff.cpp vendored Normal file
View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct AbsDiff
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vabdq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vabd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0];
}
};
template <typename T>
struct AbsDiffSigned
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
v_dst = internal::vqsubq(v_max, v_min);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
v_dst = internal::vqsub(v_max, v_min);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
}
};
} // namespace
#endif
// Element-wise absolute difference of two u8 images, written to dst.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const u8 *src0Base, ptrdiff_t src0Stride,
             const u8 *src1Base, ptrdiff_t src1Stride,
             u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiff<u8>());
#endif
}
// Element-wise absolute difference of two u16 images, written to dst.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const u16 *src0Base, ptrdiff_t src0Stride,
             const u16 *src1Base, ptrdiff_t src1Stride,
             u16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiff<u16>());
#endif
}
// Element-wise absolute difference of two s8 images, written to dst.
// Uses the saturating signed functor, so results are clamped to the s8 range.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const s8 *src0Base, ptrdiff_t src0Stride,
             const s8 *src1Base, ptrdiff_t src1Stride,
             s8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiffSigned<s8>());
#endif
}
// Element-wise absolute difference of two s16 images, written to dst.
// Uses the saturating signed functor, so results are clamped to the s16 range.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const s16 *src0Base, ptrdiff_t src0Stride,
             const s16 *src1Base, ptrdiff_t src1Stride,
             s16 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiffSigned<s16>());
#endif
}
// Element-wise absolute difference of two s32 images, written to dst.
// Uses the saturating signed functor, so results are clamped to the s32 range.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const s32 *src0Base, ptrdiff_t src0Stride,
             const s32 *src1Base, ptrdiff_t src1Stride,
             s32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiffSigned<s32>());
#endif
}
// Element-wise absolute difference of two f32 images, written to dst.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void absDiff(const Size2D &size,
             const f32 * src0Base, ptrdiff_t src0Stride,
             const f32 * src1Base, ptrdiff_t src1Stride,
             f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#else
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AbsDiff<f32>());
#endif
}
} // namespace CAROTENE_NS

408
3rdparty/carotene/src/accumulate.cpp vendored Normal file
View File

@ -0,0 +1,408 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
// Adds an 8-bit source image into a 16-bit signed accumulator image:
// dst = saturate_s16(dst + src), element by element.
// Each row is handled in three passes: 16 elements per step, then 8 per
// step, then a scalar loop for whatever is left.
// Without CAROTENE_NEON no pixel processing is performed by this function.
void accumulate(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Exclusive upper bounds for the start index of a 16- / 8-element step, so
// that a full vector load/store never runs past the end of the row.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j);
int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
// Widen each half of the source to 16 bits; values are at most 255, so
// reinterpreting the unsigned lanes as signed keeps them unchanged.
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
// Saturating add keeps the accumulator within the s16 range.
v_dst0 = vqaddq_s16(v_dst0, v_src0);
v_dst1 = vqaddq_s16(v_dst1, v_src1);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
int16x8_t v_dst = vld1q_s16(dst + j);
v_dst = vqaddq_s16(v_dst, v_src16);
vst1q_s16(dst + j, v_dst);
}
// Scalar tail for the remaining elements of the row.
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
}
#else
// Parameters are intentionally unused in this configuration.
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Adds the shifted square of an 8-bit source image into a 16-bit signed
// accumulator: dst = saturate_s16(dst + ((src * src) >> shift)).
// `shift` is a template parameter because the shift intrinsic used below
// takes it as a compile-time constant.
// Each row is handled 16 elements per step, then 8, then a scalar tail.
template <int shift>
void accumulateSquareConst(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
// Exclusive upper bounds for the start index of a 16- / 8-element step.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
// Per 4-lane half: square into 32 bits, shift right, add the widened
// accumulator, then narrow back to 16 bits with saturation.
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
// Scalar tail for the remaining elements of the row.
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
}
}
}
// Specialization for shift == 0: dst = saturate_s16(dst + src * src).
// Identical to the generic version except that the shift step is omitted
// (presumably because the shift-right-immediate intrinsic does not accept 0 - confirm).
template <>
void accumulateSquareConst<0>(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
// Exclusive upper bounds for the start index of a 16- / 8-element step.
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
// Per 4-lane half: square into 32 bits, add the widened accumulator,
// then narrow back to 16 bits with saturation.
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
// Scalar tail for the remaining elements of the row.
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
}
}
}
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride);
} // namespace
#endif
// Adds the shifted square of an 8-bit source image into a 16-bit signed
// accumulator: dst = saturate_s16(dst + ((src * src) >> shift)).
//
// size               - image dimensions in elements
// srcBase/srcStride  - source image and its row stride
// dstBase/dstStride  - accumulator image (read and written) and its row stride
// shift              - right shift applied to the squared source value
//
// Without CAROTENE_NEON no pixel processing is performed by this function.
void accumulateSquare(const Size2D &size,
                      const u8 *srcBase, ptrdiff_t srcStride,
                      s16 *dstBase, ptrdiff_t dstStride,
                      u32 shift)
{
    // The source is 8-bit, so src * src <= 255 * 255 = 65025 < 2^16. With a
    // shift of 16 or more the added term is therefore always 0 and the
    // accumulator must stay as it is (it used to be cleared here, which
    // discarded everything accumulated so far).
    if (shift >= 16)
        return;

    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // this ugly construction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    //     return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
    // i.e. the shift has to be a compile-time constant, so one instantiation
    // per supported shift value is selected at run time.
    static const accumulateSquareConstFunc funcs[16] =
    {
        accumulateSquareConst<0>,
        accumulateSquareConst<1>,
        accumulateSquareConst<2>,
        accumulateSquareConst<3>,
        accumulateSquareConst<4>,
        accumulateSquareConst<5>,
        accumulateSquareConst<6>,
        accumulateSquareConst<7>,
        accumulateSquareConst<8>,
        accumulateSquareConst<9>,
        accumulateSquareConst<10>,
        accumulateSquareConst<11>,
        accumulateSquareConst<12>,
        accumulateSquareConst<13>,
        accumulateSquareConst<14>,
        accumulateSquareConst<15>
    };

    // shift < 16 is guaranteed by the early return above.
    funcs[shift](size, srcBase, srcStride, dstBase, dstStride);
#else
    // Parameters are intentionally unused in this configuration.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}
#ifdef CAROTENE_NEON
namespace {
struct AccumulateWeightedHalf
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vhaddq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vhadd_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
}
};
// Functor blending two u8 inputs with floating-point weights:
// dst = beta * src1 + alpha * src0, where beta = 1 - alpha.
// The vector overloads widen u8 -> u16 -> u32 -> f32, blend in float, and
// narrow back through u32 and u16 to u8.
// NOTE(review): the narrowing steps do not saturate, so this assumes alpha
// lies in [0, 1] and the blended value fits into u8 - confirm with callers.
struct AccumulateWeighted
{
typedef u8 type;
// Scalar weights and their broadcast vector counterparts.
float alpha, beta;
float32x4_t v_alpha, v_beta;
explicit AccumulateWeighted(float _alpha) :
alpha(_alpha), beta(1 - _alpha)
{
v_alpha = vdupq_n_f32(alpha);
v_beta = vdupq_n_f32(beta);
}
// 16-lane overload: the low and high 8 lanes are processed one after the
// other, each as two groups of 4 float lanes.
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
// src1 * beta + alpha * src0 for 4 lanes at a time.
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
}
// 8-lane overload: same computation on a single group of 8 lanes.
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
uint8x8_t & v_dst) const
{
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vmovn_u16(_v_dst);
}
// Single-element overload; the float result is converted to u8 implicitly.
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = beta * src1[0] + alpha * src0[0];
}
};
} // namespace
#endif
void accumulateWeighted(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride,
f32 alpha)
{
if (alpha == 0.0f)
return;
if (alpha == 1.0f)
{
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memcpy(dst, src, sizeof(u8) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// in this case we can use the following scheme:
// dst[p] = (src[p] + dst[p]) >> 1
// which is faster
if (alpha == 0.5f)
{
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeightedHalf());
return;
}
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeighted(alpha));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)alpha;
#endif
}
} //namespace CAROTENE_NS

475
3rdparty/carotene/src/add.cpp vendored Normal file
View File

@ -0,0 +1,475 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct AddWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
}
};
template <typename T, typename WT>
struct AddSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
}
};
} // namespace
#endif
// dst = src0 + src1 for u8 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u8, u16>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u8, u16>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for s8 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s8, s16>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s8, s16>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Widening add: dst(s16) = src0(u8) + src1(u8).
// The sum of two u8 values is at most 510, so it always fits in 16 bits;
// the conversion policy argument is therefore unnamed and ignored. The
// destination rows are written through a u16 pointer.
void add(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
    // Exclusive limits for the 32-element and 8-element vector steps.
    const size_t end32 = width >= 31 ? width - 31 : 0;
    const size_t end8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * a = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * b = internal::getRowPtr(src1Base, src1Stride, row);
        u16 * out = internal::getRowPtr((u16 *)dstBase, dstStride, row);
        size_t x = 0;

        // 32 elements per iteration.
        while (x < end32)
        {
            internal::prefetch(a + x);
            internal::prefetch(b + x);
            const uint8x16_t va0 = vld1q_u8(a + x);
            const uint8x16_t va1 = vld1q_u8(a + x + 16);
            const uint8x16_t vb0 = vld1q_u8(b + x);
            const uint8x16_t vb1 = vld1q_u8(b + x + 16);
            vst1q_u16(out + x, vaddl_u8(vget_low_u8(va0), vget_low_u8(vb0)));
            vst1q_u16(out + x + 8, vaddl_u8(vget_high_u8(va0), vget_high_u8(vb0)));
            vst1q_u16(out + x + 16, vaddl_u8(vget_low_u8(va1), vget_low_u8(vb1)));
            vst1q_u16(out + x + 24, vaddl_u8(vget_high_u8(va1), vget_high_u8(vb1)));
            x += 32;
        }

        // 8 elements per iteration.
        while (x < end8)
        {
            const uint8x8_t va = vld1_u8(a + x);
            const uint8x8_t vb = vld1_u8(b + x);
            vst1q_u16(out + x, vaddl_u8(va, vb));
            x += 8;
        }

        // Scalar remainder.
        while (x < width)
        {
            out[x] = (u16)a[x] + (u16)b[x];
            ++x;
        }
    }
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Mixed-type add: dst(s16) = src0(u8) + src1(s16).
// src0 is zero-extended to 16 bits first. CONVERT_POLICY_SATURATE clamps the
// sum to the s16 range; any other policy value lets it wrap.
void add(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const bool saturate = (policy == CONVERT_POLICY_SATURATE);
    // Exclusive limits for the 16-element and 8-element vector steps.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * a = internal::getRowPtr(src0Base, src0Stride, row);
        const s16 * b = internal::getRowPtr(src1Base, src1Stride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        // 16 elements per iteration.
        for (; x < end16; x += 16)
        {
            internal::prefetch(a + x);
            internal::prefetch(b + x);
            const uint8x16_t va = vld1q_u8(a + x);
            const int16x8_t va_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(va)));
            const int16x8_t va_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(va)));
            const int16x8_t vb_lo = vld1q_s16(b + x);
            const int16x8_t vb_hi = vld1q_s16(b + x + 8);
            int16x8_t vsum_lo, vsum_hi;
            if (saturate)
            {
                vsum_lo = vqaddq_s16(va_lo, vb_lo);
                vsum_hi = vqaddq_s16(va_hi, vb_hi);
            }
            else
            {
                vsum_lo = vaddq_s16(va_lo, vb_lo);
                vsum_hi = vaddq_s16(va_hi, vb_hi);
            }
            vst1q_s16(out + x, vsum_lo);
            vst1q_s16(out + x + 8, vsum_hi);
        }

        // 8 elements per iteration.
        for (; x < end8; x += 8)
        {
            const int16x8_t va = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(a + x)));
            const int16x8_t vb = vld1q_s16(b + x);
            int16x8_t vsum;
            if (saturate)
                vsum = vqaddq_s16(va, vb);
            else
                vsum = vaddq_s16(va, vb);
            vst1q_s16(out + x, vsum);
        }

        // Scalar remainder: the sum is formed in s32 either way.
        for (; x < size.width; x++)
        {
            if (saturate)
                out[x] = internal::saturate_cast<s16>((s32)a[x] + (s32)b[x]);
            else
                out[x] = (s16)((s32)a[x] + (s32)b[x]);
        }
    }
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for s16 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s16, s32>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s16, s32>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for u16 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u16, u32>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u16, u32>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for s32 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<s32, s64>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<s32, s64>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for u32 images.
// CONVERT_POLICY_SATURATE selects saturating addition; any other policy
// value selects wrap-around addition. Does nothing beyond the configuration
// check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const u32 * src0Base, ptrdiff_t src0Stride,
         const u32 * src1Base, ptrdiff_t src1Stride,
         u32 * dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             AddWrap<u32, u64>());
        return;
    }
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         AddSaturate<u32, u64>());
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// dst = src0 + src1 for f32 images. There is no conversion policy for
// floating point, so the plain AddWrap functor is always used. Does nothing
// beyond the configuration check when CAROTENE_NEON is not defined.
void add(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    AddWrap<f32, f32> floatAdd;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         floatAdd);
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

265
3rdparty/carotene/src/add_weighted.cpp vendored Normal file
View File

@ -0,0 +1,265 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
using namespace internal;
// Compile-time type map used by the weighted-add functors below.
//   wide   : the next wider element type
//   narrow : the next narrower element type (absent for 8-bit and f32)
//   unsign : the unsigned type of the same width (absent for f32)
//   vec128 : the 128-bit NEON vector holding elements of this type
template <typename T> struct TypeTraits;

template <> struct TypeTraits< u8>
{
    typedef u16 wide;
    typedef u8 unsign;
    typedef uint8x16_t vec128;
};

template <> struct TypeTraits< s8>
{
    typedef s16 wide;
    typedef u8 unsign;
    typedef int8x16_t vec128;
};

template <> struct TypeTraits<u16>
{
    typedef u32 wide;
    typedef u8 narrow;
    typedef u16 unsign;
    typedef uint16x8_t vec128;
};

template <> struct TypeTraits<s16>
{
    typedef s32 wide;
    typedef s8 narrow;
    typedef u16 unsign;
    typedef int16x8_t vec128;
};

template <> struct TypeTraits<u32>
{
    typedef u64 wide;
    typedef u16 narrow;
    typedef u32 unsign;
    typedef uint32x4_t vec128;
};

template <> struct TypeTraits<s32>
{
    typedef s64 wide;
    typedef s16 narrow;
    typedef u32 unsign;
    typedef int32x4_t vec128;
};

template <> struct TypeTraits<f32>
{
    typedef f64 wide;
    typedef float32x4_t vec128;
};
template <typename T> struct wAdd
{
typedef T type;
f32 alpha, beta, gamma;
typedef typename TypeTraits<T>::wide wtype;
wAdd<wtype> wideAdd;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma),
wideAdd(_alpha, _beta, _gamma) {}
void operator() (const typename VecTraits<T>::vec128 & v_src0,
const typename VecTraits<T>::vec128 & v_src1,
typename VecTraits<T>::vec128 & v_dst) const
{
typename VecTraits<wtype>::vec128 vrl, vrh;
wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
}
void operator() (const typename VecTraits<T>::vec64 & v_src0,
const typename VecTraits<T>::vec64 & v_src1,
typename VecTraits<T>::vec64 & v_dst) const
{
typename VecTraits<wtype>::vec128 vr;
wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
v_dst = vqmovn(vr);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<s32>
{
typedef s32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const typename VecTraits<s32>::vec128 & v_src0,
const typename VecTraits<s32>::vec128 & v_src1,
typename VecTraits<s32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_s32(v_src0);
float32x4_t vs2 = vcvtq_f32_s32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_s32_f32(vs1);
}
void operator() (const typename VecTraits<s32>::vec64 & v_src0,
const typename VecTraits<s32>::vec64 & v_src1,
typename VecTraits<s32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_s32(v_src0);
float32x2_t vs2 = vcvt_f32_s32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_s32_f32(vs1);
}
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
{
dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<u32>
{
typedef u32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const typename VecTraits<u32>::vec128 & v_src0,
const typename VecTraits<u32>::vec128 & v_src1,
typename VecTraits<u32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_u32(v_src0);
float32x4_t vs2 = vcvtq_f32_u32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_u32_f32(vs1);
}
void operator() (const typename VecTraits<u32>::vec64 & v_src0,
const typename VecTraits<u32>::vec64 & v_src1,
typename VecTraits<u32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_u32(v_src0);
float32x2_t vs2 = vcvt_f32_u32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_u32_f32(vs1);
}
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
{
dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
// Weighted add for f32 elements: dst = alpha * src0 + beta * src1 + gamma.
//
// The result stays in floating point, so no rounding bias applies. The
// previous code broadcast gamma + 0.5 (copied from the integer
// specialisations), which made every vector-processed element 0.5 larger
// than the scalar tail and than the formula above. vgamma now holds gamma
// unchanged.
template <> struct wAdd<f32>
{
    typedef f32 type;
    f32 alpha, beta, gamma;
    // Broadcast copies of the three weights.
    float32x4_t valpha, vbeta, vgamma;
    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
        alpha(_alpha), beta(_beta), gamma(_gamma)
    {
        valpha = vdupq_n_f32(_alpha);
        vbeta = vdupq_n_f32(_beta);
        vgamma = vdupq_n_f32(_gamma);
    }

    // 128-bit step (four elements): gamma + src0 * alpha, then + src1 * beta.
    void operator() (const typename VecTraits<f32>::vec128 & v_src0,
                     const typename VecTraits<f32>::vec128 & v_src1,
                     typename VecTraits<f32>::vec128 & v_dst) const
    {
        float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
        v_dst = vmlaq_f32(vs1, v_src1, vbeta);
    }

    // 64-bit step (two elements).
    void operator() (const typename VecTraits<f32>::vec64 & v_src0,
                     const typename VecTraits<f32>::vec64 & v_src1,
                     typename VecTraits<f32>::vec64 & v_dst) const
    {
        float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
        v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
    }

    // Scalar step for leftover elements.
    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
    {
        dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
    }
};
} // namespace
// NEON build: defines addWeighted() for one element type. The function
// checks the configuration, builds a wAdd<type> functor from the three
// weights and runs it over both source images via internal::vtransform.
#define IMPL_ADDWEIGHTED(type)                                          \
void addWeighted(const Size2D &size,                                    \
                 const type * src0Base, ptrdiff_t src0Stride,           \
                 const type * src1Base, ptrdiff_t src1Stride,           \
                 type * dstBase, ptrdiff_t dstStride,                   \
                 f32 alpha, f32 beta, f32 gamma)                        \
{                                                                       \
    internal::assertSupportedConfiguration();                           \
    wAdd<type> weightedSum(alpha, beta, gamma);                         \
    internal::vtransform(size,                                          \
                         src0Base, src0Stride,                          \
                         src1Base, src1Stride,                          \
                         dstBase, dstStride,                            \
                         weightedSum);                                  \
}
#else
// Non-NEON build: defines an addWeighted() stub for one element type. It
// only runs the configuration check; all arguments are unused.
#define IMPL_ADDWEIGHTED(type)                                          \
void addWeighted(const Size2D &size,                                    \
                 const type * src0Base, ptrdiff_t src0Stride,           \
                 const type * src1Base, ptrdiff_t src1Stride,           \
                 type * dstBase, ptrdiff_t dstStride,                   \
                 f32 alpha, f32 beta, f32 gamma)                        \
{                                                                       \
    internal::assertSupportedConfiguration();                           \
    (void)size;                                                         \
    (void)src0Base;                                                     \
    (void)src0Stride;                                                   \
    (void)src1Base;                                                     \
    (void)src1Stride;                                                   \
    (void)dstBase;                                                      \
    (void)dstStride;                                                    \
    (void)alpha;                                                        \
    (void)beta;                                                         \
    (void)gamma;                                                        \
}
#endif
// Instantiate addWeighted() once per supported element type, using whichever
// IMPL_ADDWEIGHTED definition (NEON or stub) was selected above.
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)
} // namespace CAROTENE_NS

225
3rdparty/carotene/src/bitwise.cpp vendored Normal file
View File

@ -0,0 +1,225 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
// Functor computing the bitwise AND of two u8 operands, in 16-lane, 8-lane
// and single-element form.
struct BitwiseAnd
{
    typedef u8 type;

    // 16 bytes at once.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = vandq_u8(lhs, rhs);
    }

    // 8 bytes at once.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = vand_u8(lhs, rhs);
    }

    // Single byte, for leftover elements.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        *dst = *src0 & *src1;
    }
};
// Functor computing the bitwise OR of two u8 operands, in 16-lane, 8-lane
// and single-element form.
struct BitwiseOr
{
    typedef u8 type;

    // 16 bytes at once.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = vorrq_u8(lhs, rhs);
    }

    // 8 bytes at once.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = vorr_u8(lhs, rhs);
    }

    // Single byte, for leftover elements.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        *dst = *src0 | *src1;
    }
};
// Functor computing the bitwise XOR of two u8 operands, in 16-lane, 8-lane
// and single-element form.
struct BitwiseXor
{
    typedef u8 type;

    // 16 bytes at once.
    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
                     uint8x16_t & result) const
    {
        result = veorq_u8(lhs, rhs);
    }

    // 8 bytes at once.
    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
                     uint8x8_t & result) const
    {
        result = veor_u8(lhs, rhs);
    }

    // Single byte, for leftover elements.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        *dst = *src0 ^ *src1;
    }
};
#endif
// dst = ~src for every byte of a u8 image, row by row.
// Each row is processed 32 bytes at a time, then 8 at a time, then byte by
// byte. Does nothing beyond the configuration check when CAROTENE_NEON is
// not defined.
void bitwiseNot(const Size2D &size,
                const u8 *srcBase, ptrdiff_t srcStride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
    // Exclusive limits for the 32-byte and 8-byte vector steps.
    const size_t end32 = width >= 31 ? width - 31 : 0;
    const size_t end8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8* in = internal::getRowPtr(srcBase, srcStride, row);
        u8* out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0;

        // 32 bytes per iteration: load both halves, invert, store.
        while (x < end32)
        {
            internal::prefetch(in + x);
            const uint8x16_t v_in0 = vld1q_u8(in + x);
            const uint8x16_t v_in1 = vld1q_u8(in + x + 16);
            const uint8x16_t v_out0 = vmvnq_u8(v_in0);
            const uint8x16_t v_out1 = vmvnq_u8(v_in1);
            vst1q_u8(out + x, v_out0);
            vst1q_u8(out + x + 16, v_out1);
            x += 32;
        }

        // 8 bytes per iteration.
        while (x < end8)
        {
            vst1_u8(out + x, vmvn_u8(vld1_u8(in + x)));
            x += 8;
        }

        // Scalar remainder.
        while (x < width)
        {
            out[x] = ~in[x];
            ++x;
        }
    }
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// dst = src0 & src1 for u8 images, via the BitwiseAnd functor. Does nothing
// beyond the configuration check when CAROTENE_NEON is not defined.
void bitwiseAnd(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    BitwiseAnd andOp;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         andOp);
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// dst = src0 | src1 for u8 images, via the BitwiseOr functor. Does nothing
// beyond the configuration check when CAROTENE_NEON is not defined.
void bitwiseOr(const Size2D &size,
               const u8 *src0Base, ptrdiff_t src0Stride,
               const u8 *src1Base, ptrdiff_t src1Stride,
               u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    BitwiseOr orOp;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         orOp);
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// dst = src0 ^ src1 for u8 images, via the BitwiseXor functor. Does nothing
// beyond the configuration check when CAROTENE_NEON is not defined.
void bitwiseXor(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    BitwiseXor xorOp;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         xorOp);
#else
    // No NEON: the arguments are intentionally unused.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

1337
3rdparty/carotene/src/blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

773
3rdparty/carotene/src/canny.cpp vendored Normal file
View File

@ -0,0 +1,773 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Horizontal pass of the 3x3 gradient filters used by Canny.
// For each pixel x of a u8 row it writes, as s16:
//   dstx[x] = src[x+1] - src[x-1]                (horizontal difference)
//   dsty[x] = src[x-1] + 2*src[x] + src[x+1]     (horizontal 1-2-1 sum)
// How the pixels just outside the row are obtained depends on the two
// border flags given to the constructor.
struct RowFilter3x3Canny
{
// borderxl / borderxr: presumably 0 or 1 (TODO confirm with callers).
// A non-zero flag selects a table-lookup mask that repeats the row's edge
// pixel on that side; zero makes the filter read one real pixel beyond the
// row on that side (lookLeft / lookRight become 1).
inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
{
// vtbl1_u8 index masks; an index of 0xFF is out of range and yields 0.
// Only lanes 6 and 7 of the "previous" vector are ever consumed below.
vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
lookLeft = offsetk - borderxl;
lookRight = offsetk - borderxr;
}
// Filters one row of `width` pixels starting at src into dstx and dsty.
// NOTE(review): the tail reads from src + (width - 9), so width >= 9 (or
// readable memory before the row) appears to be required -- confirm callers.
inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
{
// `l` carries the previous 8 pixels; its last two lanes supply the left
// neighbour and the centre pixel for the first outputs of the next block.
uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
ptrdiff_t i = 0;
for (; i < width - 8 + lookRight; i += 8)
{
internal::prefetch(src + i);
// Right neighbours: pixels i+1 .. i+8.
uint8x8_t l18u = vld1_u8(src + i + 1);
uint8x8_t l2 = l18u;
// Left neighbours: last two lanes of l followed by six lanes of l18u.
uint8x8_t l0 = vext_u8(l, l18u, 6);
// Centre pixels, widened to 16 bits and doubled.
int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
l = l18u;
int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
int16x8_t ldy = vaddq_s16(l02, l1x2);
vst1q_s16(dstx + i, ldx);
vst1q_s16(dsty + i, ldy);
}
//tail
// Recomputes the last 8 outputs (possibly overlapping the main loop) when
// the right edge pixel has to be repeated or the loop did not reach `width`.
if (lookRight == 0 || i != width)
{
uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
// Centre pixels: lane 1 of tail0 followed by the first seven lanes of tail2.
uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
vst1q_s16(dstx + (width - 8), taildx);
vst1q_s16(dsty + (width - 8), taildy);
}
}
// vtbl1_u8 index masks for the first (left) and tail (right) loads.
uint8x8_t vfmask;
uint8x8_t vtmask;
// Filter half-width: one pixel on each side of the centre.
enum { offsetk = 1};
// Number of real pixels read beyond the row on the left / right.
ptrdiff_t lookLeft;
ptrdiff_t lookRight;
};
template <bool L2gradient>
inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
ColFilter3x3CannyL1Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int16x8_t dya = vabsq_s16(dy);
int16x8_t dxa = vabsq_s16(dx);
int16x8_t norm = vaddq_s16(dya, dxa);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j + 4, normh);
vst1q_s32(mag + j, norml);
}
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL1Loop;
}
}
// Vertical pass of the 3x3 gradient filters, L2 (squared) magnitude variant.
// Each of src0/src1/src2 points to one intermediate row made of two planes
// of `width` s16 values: the first plane starts at the pointer, the second
// at pointer + width. For every column:
//   dstx = src0[first] + 2 * src1[first] + src2[first]
//   dsty = src2[second] - src0[second]
//   mag  = dstx * dstx + dsty * dsty   (accumulated in s32)
// Columns are handled 8 at a time; if width is not a multiple of 8 the last
// step is re-run on the final 8 columns, overlapping earlier output.
// NOTE(review): for 0 < width < 8 that last step starts at a negative
// offset; callers presumably guarantee width >= 8.
template <>
inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
    ptrdiff_t col = 0;
    for (;;)
    {
        if (col > width - 8)
        {
            // All full blocks are done: stop if the row was covered exactly,
            // otherwise run one more block aligned to the end of the row.
            if (col == width)
                break;
            col = width - 8;
        }

        const int16x8_t v_top_a = vld1q_s16(src0 + col);
        const int16x8_t v_mid_a = vld1q_s16(src1 + col);
        const int16x8_t v_bot_a = vld1q_s16(src2 + col);
        const int16x8_t v_top_b = vld1q_s16(src0 + col + width);
        const int16x8_t v_bot_b = vld1q_s16(src2 + col + width);

        const int16x8_t v_outer = vaddq_s16(v_top_a, v_bot_a);
        const int16x8_t v_mid_x2 = vshlq_n_s16(v_mid_a, 1);
        const int16x8_t v_dy = vsubq_s16(v_bot_b, v_top_b);
        const int16x8_t v_dx = vaddq_s16(v_mid_x2, v_outer);

        // Low four lanes: dx^2 first, then += dy^2.
        int32x4_t v_sq_lo = vmull_s16(vget_low_s16(v_dx), vget_low_s16(v_dx));
        // High four lanes: dy^2 first, then += dx^2.
        int32x4_t v_sq_hi = vmull_s16(vget_high_s16(v_dy), vget_high_s16(v_dy));
        v_sq_lo = vmlal_s16(v_sq_lo, vget_low_s16(v_dy), vget_low_s16(v_dy));
        v_sq_hi = vmlal_s16(v_sq_hi, vget_high_s16(v_dx), vget_high_s16(v_dx));

        vst1q_s16(dsty + col, v_dy);
        vst1q_s16(dstx + col, v_dx);
        vst1q_s32(mag + col, v_sq_lo);
        vst1q_s32(mag + col + 4, v_sq_hi);

        col += 8;
    }
}
template <bool L2gradient>
inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
ptrdiff_t j = 0;
if (colscn >= 8)
{
int16x8_t vx = vld1q_s16(_dx);
int16x8_t vy = vld1q_s16(_dy);
for (; j <= colscn - 16; j+=8)
{
internal::prefetch(_dx);
internal::prefetch(_dy);
int16x8_t vx2 = vld1q_s16(_dx + j + 8);
int16x8_t vy2 = vld1q_s16(_dy + j + 8);
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
vx = vx2;
vy = vy2;
}
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
}
for (; j < colscn; j++)
_norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
}
/*
 * Squared L2 gradient magnitude: _norm[j] = _dx[j]^2 + _dy[j]^2 for j in [0, colscn).
 *
 * The vector path handles eight elements per step with the next block loaded
 * one iteration ahead; the scalar loop finishes the remaining (< 8) elements.
 */
template <>
inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
    ptrdiff_t j = 0;
    if (colscn >= 8)
    {
        int16x8_t vx = vld1q_s16(_dx);
        int16x8_t vy = vld1q_s16(_dy);
        for (; j <= colscn - 16; j += 8)
        {
            // Advance the prefetch hint together with the read position.
            internal::prefetch(_dx + j);
            internal::prefetch(_dy + j);

            int16x8_t vxnext = vld1q_s16(_dx + j + 8);
            int16x8_t vynext = vld1q_s16(_dy + j + 8);

            // Widening multiply / multiply-accumulate: 16-bit inputs, 32-bit sums.
            int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
            int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
            norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
            normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));

            vst1q_s32(_norm + j + 0, norml);
            vst1q_s32(_norm + j + 4, normh);

            vx = vxnext;
            vy = vynext;
        }

        // Last full vector block (already loaded into vx / vy).
        int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
        int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
        norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
        normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
        vst1q_s32(_norm + j + 0, norml);
        vst1q_s32(_norm + j + 4, normh);

        // These eight elements are done; do not recompute them in the scalar tail.
        j += 8;
    }
    for (; j < colscn; j++)
        _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
}
/*
 * Converts the floating-point hysteresis thresholds into integers for the L1
 * magnitude: the two values are ordered (low <= high) and each is rounded down
 * to the nearest integer not greater than it.
 */
template <bool L2gradient>
inline void prepareThresh(f64 low_thresh, f64 high_thresh,
                          s32 &low, s32 &high)
{
    if (high_thresh < low_thresh)
        std::swap(low_thresh, high_thresh);

#if defined __GNUC__
    // The cast truncates toward zero; step down once when that rounded up.
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    if (low > low_thresh)
        --low;
    if (high > high_thresh)
        --high;
#else
    // Round first, then step down once when rounding went above the input.
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    const f32 lowResidual = (f32)(low_thresh - low);
    const f32 highResidual = (f32)(high_thresh - high);
    if (lowResidual < 0)
        --low;
    if (highResidual < 0)
        --high;
#endif
}
/*
 * Threshold preparation for the squared-L2 magnitude: the thresholds are
 * ordered (low <= high), positive ones are squared so they can be compared
 * with squared magnitudes, and each is rounded down to an integer.
 */
template <>
inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
                                s32 &low, s32 &high)
{
    if (high_thresh < low_thresh)
        std::swap(low_thresh, high_thresh);

    // Only positive thresholds are squared; zero and negative ones stay as they are.
    if (low_thresh > 0)
        low_thresh = low_thresh * low_thresh;
    if (high_thresh > 0)
        high_thresh = high_thresh * high_thresh;

#if defined __GNUC__
    // The cast truncates toward zero; step down once when that rounded up.
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    if (low > low_thresh)
        --low;
    if (high > high_thresh)
        --high;
#else
    // Round first, then step down once when rounding went above the input.
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    const f32 lowResidual = (f32)(low_thresh - low);
    const f32 highResidual = (f32)(high_thresh - high);
    if (lowResidual < 0)
        --low;
    if (highResidual < 0)
        --high;
#endif
}
// Gradient and magnitude provider used by Canny3x3. This primary template
// computes the derivatives itself from the 8-bit source image: a horizontal
// pass (sobelRow) followed by a vertical pass (ColFilter3x3Canny).
//
// It owns one allocation ("buffer") that holds, in this order:
//   - three ring-buffer rows of magstep s32 elements each (mag_buf[0..2]);
//   - the edge map of (size.width + 2) * (size.height + 2) bytes.
// Each ring-buffer row starts with size.width + 2 s32 magnitudes, followed by
// four planes of size.width s16 values: dx, dy, and the two planes written by
// sobelRow.
template <bool L2gradient, bool externalSobel>
struct _normEstimator
{
// Length of one ring-buffer row, in s32 elements.
ptrdiff_t magstep;
// Offsets of the four s16 planes, in s16 elements from the start of a ring-buffer row.
ptrdiff_t dxOffset;
ptrdiff_t dyOffset;
ptrdiff_t shxOffset;
ptrdiff_t shyOffset;
// Backing storage for the ring-buffer rows and the edge map.
std::vector<u8> buffer;
// Always initialised to 1; the border widths below are derived from it.
const ptrdiff_t offsetk;
// max(0, offsetk - margin) for the top and bottom margins.
ptrdiff_t borderyt, borderyb;
// Horizontal filter; constructed with the same max(0, offsetk - margin) rule for left/right.
RowFilter3x3Canny sobelRow;
// Sets up the buffer layout and returns it to the caller:
//   mapstep - row length of the edge map (size.width + 2);
//   mag_buf - the three ring-buffer row pointers;
//   map     - start of the edge map.
// The unnamed s32 parameter (channel count at the call site) is not used here.
inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
offsetk(1),
sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
{
mapstep = size.width + 2;
// width + 2 magnitudes plus four s16 planes of width elements, counted in s32 units.
magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
// The s16 planes start right after the magnitudes; convert that position to s16 units.
dxOffset = mapstep * sizeof(s32)/sizeof(s16);
dyOffset = dxOffset + size.width * 1;
shxOffset = dxOffset + size.width * 2;
shyOffset = dxOffset + size.width * 3;
buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + magstep;
mag_buf[2] = mag_buf[1] + magstep;
// The magnitudes of the first ring-buffer row start out as zero.
memset(mag_buf[0], 0, mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + magstep);
// First and last map rows are guard rows filled with 1.
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
}
// Produces dx, dy and the magnitude of source row 0 in mag_buf[1].
// The horizontal pass of row 0 goes to mag_buf[0] and that of row 1 to mag_buf[1].
// The dx/dy base pointers and strides are unused in this variant.
inline void firstRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
s32** mag_buf)
{
//sobelH row #0
const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
//sobelH row #1
_src = internal::getRowPtr(srcBase, srcStride, 1);
sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
// Zero the guard magnitudes on both sides of the row.
mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
if (borderyt == 0)
{
// borderyt == 0: source row -1 is read and used as the row above.
//sobelH row #-1
_src = internal::getRowPtr(srcBase, srcStride, -1);
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
else
{
// Otherwise row 0 stands in for the missing row above.
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
}
// Produces dx, dy and the magnitude of source row i in mag_buf[2], and returns
// in _x / _y the dx / dy planes currently stored in mag_buf[1].
// The caller is expected to rotate mag_buf between calls.
inline void nextRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
// Zero the guard magnitudes on both sides of the row.
mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
if (i < size.height - borderyb)
{
const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
//sobelH row #i+1
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else if (i < size.height)
{
// No source row below is read: the middle row (mag_buf[1]) is used as the row below as well.
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else
// i >= size.height: the magnitudes of this row are all zero.
memset(mag_buf[2], 0, mapstep*sizeof(s32));
_x = ((s16*)mag_buf[1]) + dxOffset;
_y = ((s16*)mag_buf[1]) + dyOffset;
}
};
/*
 * Gradient and magnitude provider used by Canny3x3 when the caller supplies
 * precomputed dx / dy images (externalSobel == true).
 *
 * It owns one allocation ("buffer") that holds three magnitude rows of
 * cn * mapstep s32 elements each (mag_buf[0..2]), followed by the edge map of
 * (size.width + 2) * (size.height + 2) bytes.
 *
 * For multi-channel input (cn > 1) every pixel is reduced to the channel with
 * the largest magnitude. The chosen dx / dy values are written back to the
 * front of the caller's dx / dy rows, so those rows are modified in place.
 */
template <bool L2gradient>
struct _normEstimator<L2gradient, true>
{
    // Backing storage for the magnitude rows and the edge map.
    std::vector<u8> buffer;

    /*
     * Sets up the buffer layout and returns it to the caller:
     *   mapstep - row length of the edge map (size.width + 2);
     *   mag_buf - the three magnitude row pointers;
     *   map     - start of the edge map.
     * The Margin argument is unused in this variant.
     */
    inline _normEstimator(const Size2D &size, s32 cn, Margin,
                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
    {
        mapstep = size.width + 2;
        buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
        mag_buf[0] = (s32*)&buffer[0];
        mag_buf[1] = mag_buf[0] + mapstep*cn;
        mag_buf[2] = mag_buf[1] + mapstep*cn;
        // Only the first mapstep entries are cleared, not cn * mapstep.
        memset(mag_buf[0], 0, mapstep * sizeof(s32));
        map = (u8*)(mag_buf[2] + mapstep*cn);
        // First and last map rows are guard rows filled with 1.
        memset(map, 1, mapstep);
        memset(map + mapstep*(size.height + 1), 1, mapstep);
    }

    /*
     * Computes the magnitudes of gradient row 0 into mag_buf[1].
     * The source image pointer and stride are unused in this variant.
     */
    inline void firstRow(const Size2D &size, s32 cn,
                         const u8 *, ptrdiff_t,
                         s16* dxBase, ptrdiff_t dxStride,
                         s16* dyBase, ptrdiff_t dyStride,
                         s32** mag_buf)
    {
        s32* _norm = mag_buf[1] + 1;
        s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
        s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
        fillNormRow(size, cn, _dx, _dy, _norm);
    }

    /*
     * Computes the magnitudes of gradient row i (zeros once i >= size.height)
     * and returns in _x / _y the dx / dy rows with index i - 1.
     */
    inline void nextRow(const Size2D &size, s32 cn,
                        const u8 *, ptrdiff_t,
                        s16* dxBase, ptrdiff_t dxStride,
                        s16* dyBase, ptrdiff_t dyStride,
                        const ptrdiff_t &mapstep, s32** mag_buf,
                        size_t i, const s16* &_x, const s16* &_y)
    {
        // mag_buf[2] for every i > 0, mag_buf[1] for i == 0.
        s32* _norm = mag_buf[(i > 0) + 1] + 1;
        if (i < size.height)
        {
            s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
            s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
            fillNormRow(size, cn, _dx, _dy, _norm);
        }
        else
        {
            // Only the first mapstep entries are cleared, not cn * mapstep.
            memset(_norm-1, 0, mapstep*sizeof(s32));
        }
        _x = internal::getRowPtr(dxBase, dxStride, i-1);
        _y = internal::getRowPtr(dyBase, dyStride, i-1);
    }

private:
    /*
     * Fills one magnitude row from a dx / dy row: computes the per-element norm,
     * reduces multi-channel pixels to their strongest channel (compacting
     * _norm, _dx and _dy in place), and zeroes the guard cells on both sides.
     */
    static inline void fillNormRow(const Size2D &size, s32 cn,
                                   s16* _dx, s16* _dy, s32* _norm)
    {
        NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
        if (cn > 1)
        {
            for (size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
            {
                // On ties the first channel wins (strict comparison).
                size_t maxIdx = jn;
                for (s32 k = 1; k < cn; ++k)
                    if (_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
                _norm[j] = _norm[maxIdx];
                _dx[j] = _dx[maxIdx];
                _dy[j] = _dy[maxIdx];
            }
        }
        _norm[-1] = _norm[size.width] = 0;
    }
};
/*
 * Canny edge detector core (3x3 gradient).
 *
 * Template parameters:
 *   L2gradient    - true: compare squared L2 magnitudes; false: L1 magnitudes.
 *   externalSobel - true: dx / dy are supplied by the caller (dxBase / dyBase);
 *                   false: they are computed from srcBase.
 *
 * Steps:
 *   1. Convert the thresholds to integers (prepareThresh).
 *   2. For every row, obtain magnitudes and gradients from _normEstimator and
 *      run non-maxima suppression, filling a byte map that has a one-cell guard
 *      border around the image:
 *        0 - the pixel might belong to an edge
 *        1 - the pixel can not belong to an edge
 *        2 - the pixel does belong to an edge
 *      Pixels above the high threshold are pushed on a stack as seeds.
 *   3. Hysteresis: pop seeds and promote their 8-connected neighbours that are
 *      still 0 to 2.
 *   4. Write the destination image: 255 where the map is 2, otherwise 0.
 */
template <bool L2gradient, bool externalSobel>
inline void Canny3x3(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     s16 * dxBase, ptrdiff_t dxStride,
                     s16 * dyBase, ptrdiff_t dyStride,
                     f64 low_thresh, f64 high_thresh,
                     Margin borderMargin)
{
    s32 low, high;
    prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);

    ptrdiff_t mapstep;
    s32* mag_buf[3];
    u8* map;
    _normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);

    // Stack of map cells marked as edges whose neighbours still have to be visited.
    size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
    std::vector<u8*> stack( maxsize );
    u8 **stack_top = &stack[0];
    u8 **stack_bottom = &stack[0];

    /* sector numbers
       (Top-Left Origin)

        1   2   3
         *  *  *
          * * *
        0*******0
          * * *
         *  *  *
        3   2   1
    */

#define CANNY_PUSH(d) *(d) = u8(2), *stack_top++ = (d)
#define CANNY_POP(d) (d) = *--stack_top

    //i == 0
    normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);

    // calculate magnitude and angle of gradient, perform non-maxima suppression.
    // fill the map with one of the following values:
    //   0 - the pixel might belong to an edge
    //   1 - the pixel can not belong to an edge
    //   2 - the pixel does belong to an edge
    for (size_t i = 1; i <= size.height; i++)
    {
        const s16 *_x, *_y;
        normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);

        u8* _map = map + mapstep*i + 1;
        _map[-1] = _map[size.width] = 1;

        s32* _mag = mag_buf[1] + 1; // take the central row
        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];

        // Make sure the stack can take one push per pixel of this row.
        if ((stack_top - stack_bottom) + size.width > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            // Growing by 3/2 alone can still be too small for a wide image with
            // few rows, so also require room for the current content plus one row.
            maxsize = std::max<size_t>(maxsize * 3/2, sz + size.width);
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        s32 prev_flag = 0;
        for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
        {
#define CANNY_SHIFT 15
            // tan(22.5 degrees) in fixed point with CANNY_SHIFT fractional bits.
            const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

            s32 m = _mag[j];
            if (m > low)
            {
                s32 xs = _x[j];
                s32 ys = _y[j];
                s32 x = abs(xs);
                s32 y = abs(ys) << CANNY_SHIFT;

                s32 tg22x = x * TG22;

                if (y < tg22x)
                {
                    // Sector 0: compare with the left and right neighbours.
                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
                }
                else
                {
                    // tan(67.5 degrees) * x = tan(22.5 degrees) * x + 2 * x
                    s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
                    if (y > tg67x)
                    {
                        // Sector 2: compare with the neighbours above and below.
                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
                    }
                    else
                    {
                        // Sectors 1 and 3: the diagonal is chosen by the signs of dx and dy.
                        s32 s = (xs ^ ys) < 0 ? -1 : 1;
                        if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
                    }
                }
            }
            prev_flag = 0;
            _map[j] = u8(1);
            continue;
        __push:
            // Local maximum. Seed it only if it exceeds the high threshold and
            // neither the previous pixel was seeded nor the pixel above is an edge.
            if (!prev_flag && m > high && _map[j-mapstep] != 2)
            {
                CANNY_PUSH(_map + j);
                prev_flag = 1;
            }
            else
                _map[j] = 0;
        }

        // scroll the ring buffer
        _mag = mag_buf[0];
        mag_buf[0] = mag_buf[1];
        mag_buf[1] = mag_buf[2];
        mag_buf[2] = _mag;
    }

    // now track the edges (hysteresis thresholding)
    while (stack_top > stack_bottom)
    {
        u8* m;
        // One pop can be followed by up to eight pushes.
        if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            maxsize = maxsize * 3/2;
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        CANNY_POP(m);

        if (!m[-1])         CANNY_PUSH(m - 1);
        if (!m[1])          CANNY_PUSH(m + 1);
        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
    }

    // the final pass, form the final image
    uint8x16_t v2 = vmovq_n_u8(2);
    const u8* ptrmap = map + mapstep + 1;
    for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
    {
        u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
        ptrdiff_t j = 0;
        for (; j < (ptrdiff_t)size.width - 16; j += 16)
        {
            // Advance the prefetch hint together with the read position.
            internal::prefetch(ptrmap + j);
            uint8x16_t vmap = vld1q_u8(ptrmap + j);
            // All-ones (255) where the map holds 2, zero elsewhere.
            uint8x16_t vdst = vceqq_u8(vmap, v2);
            vst1q_u8(_dst+j, vdst);
        }
        for (; j < (ptrdiff_t)size.width; j++)
            _dst[j] = (u8)-(ptrmap[j] >> 1);
    }
}
} // namespace
#endif
/*
 * Tells whether Canny3x3L1 / Canny3x3L2 (source-image variants) can process an
 * image of the given size: the configuration must be supported and the image
 * must be at least 9 pixels wide and 2 pixels high.
 */
bool isCanny3x3Supported(const Size2D &size)
{
    if (!isSupportedConfiguration())
        return false;

    const bool tallEnough = size.height >= 2;
    const bool wideEnough = size.width >= 9;
    return tallEnough && wideEnough;
}
/*
 * Canny edge detection on an 8-bit single-channel image using the L1 gradient
 * magnitude; the gradients are computed internally.
 * Asserts that the size is accepted by isCanny3x3Supported. Without
 * CAROTENE_NEON the function body is empty apart from that assertion.
 */
void Canny3x3L1(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh,
                Margin borderMargin)
{
    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
    // No external gradient images are supplied in this variant.
    s16 * const noGradient = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<false, false>(size, 1,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           noGradient, noStride,
                           noGradient, noStride,
                           low_thresh, high_thresh,
                           borderMargin);
#else
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)low_thresh; (void)high_thresh;
    (void)borderMargin;
#endif
}
/*
 * Canny edge detection on an 8-bit single-channel image using the squared L2
 * gradient magnitude; the gradients are computed internally.
 * Asserts that the size is accepted by isCanny3x3Supported. Without
 * CAROTENE_NEON the function body is empty apart from that assertion.
 */
void Canny3x3L2(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh,
                Margin borderMargin)
{
    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
    // No external gradient images are supplied in this variant.
    s16 * const noGradient = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<true, false>(size, 1,
                          srcBase, srcStride,
                          dstBase, dstStride,
                          noGradient, noStride,
                          noGradient, noStride,
                          low_thresh, high_thresh,
                          borderMargin);
#else
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)low_thresh; (void)high_thresh;
    (void)borderMargin;
#endif
}
/*
 * Canny edge detection from caller-supplied gradient images (dx, dy) with cn
 * channels, using the L1 gradient magnitude.
 * Without CAROTENE_NEON the function body is empty apart from the
 * configuration assertion.
 */
void Canny3x3L1(const Size2D &size, s32 cn,
                s16 * dxBase, ptrdiff_t dxStride,
                s16 * dyBase, ptrdiff_t dyStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // No source image is supplied in this variant.
    const u8 * const noSource = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<false, true>(size, cn,
                          noSource, noStride,
                          dstBase, dstStride,
                          dxBase, dxStride,
                          dyBase, dyStride,
                          low_thresh, high_thresh,
                          Margin());
#else
    (void)size;
    (void)cn;
    (void)dstBase; (void)dstStride;
    (void)dxBase; (void)dxStride;
    (void)dyBase; (void)dyStride;
    (void)low_thresh; (void)high_thresh;
#endif
}
/*
 * Canny edge detection from caller-supplied gradient images (dx, dy) with cn
 * channels, using the squared L2 gradient magnitude.
 * Without CAROTENE_NEON the function body is empty apart from the
 * configuration assertion.
 */
void Canny3x3L2(const Size2D &size, s32 cn,
                s16 * dxBase, ptrdiff_t dxStride,
                s16 * dyBase, ptrdiff_t dyStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f64 low_thresh, f64 high_thresh)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // No source image is supplied in this variant.
    const u8 * const noSource = NULL;
    const ptrdiff_t noStride = 0;

    Canny3x3<true, true>(size, cn,
                         noSource, noStride,
                         dstBase, dstStride,
                         dxBase, dxStride,
                         dyBase, dyStride,
                         low_thresh, high_thresh,
                         Margin());
#else
    (void)size;
    (void)cn;
    (void)dstBase; (void)dstStride;
    (void)dxBase; (void)dxStride;
    (void)dyBase; (void)dyStride;
    (void)low_thresh; (void)high_thresh;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,486 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
/*
 * Copies channel `coi` of a 2-channel interleaved 8-bit image into a
 * single-channel image: dst[x] = src[2 * x + coi] for every row.
 * coi is used directly as an index and is not validated here; it must be 0 or 1.
 * Without CAROTENE_NEON the function body is empty apart from the
 * configuration assertion.
 */
void extract2(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
#ifndef ANDROID
    // Upper bound for the 32-pixel step (x + 32 <= width).
    const size_t limit32 = width >= 31 ? width - 31 : 0;
#endif
    // Upper bound for the 8-pixel step (x + 8 <= width).
    const size_t limit8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * in = internal::getRowPtr(srcBase, srcStride, row);
        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0u; // destination column; the source offset is always 2 * x

#ifndef ANDROID
        while (x < limit32)
        {
            const u8 * chunk = in + 2 * x;
            internal::prefetch(chunk);

            uint8x16x2_t pixels = vld2q_u8(chunk);
            vst1q_u8(out + x, pixels.val[coi]);

            pixels = vld2q_u8(chunk + 32);
            vst1q_u8(out + x + 16, pixels.val[coi]);

            x += 32;
        }
#endif

        while (x < limit8)
        {
            uint8x8x2_t pixels = vld2_u8(in + 2 * x);
            vst1_u8(out + x, pixels.val[coi]);
            x += 8;
        }

        // Scalar tail.
        for (; x < width; ++x)
            out[x] = in[2 * x + coi];
    }
#else
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)coi;
#endif
}
/*
 * Copies channel `coi` of a 3-channel interleaved 8-bit image into a
 * single-channel image: dst[x] = src[3 * x + coi] for every row.
 * coi is used directly as an index and is not validated here; it must be 0, 1 or 2.
 * Without CAROTENE_NEON the function body is empty apart from the
 * configuration assertion.
 */
void extract3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
#ifndef ANDROID
    // Upper bound for the 32-pixel step (x + 32 <= width).
    const size_t limit32 = width >= 31 ? width - 31 : 0;
#endif
    // Upper bound for the 8-pixel step (x + 8 <= width).
    const size_t limit8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * in = internal::getRowPtr(srcBase, srcStride, row);
        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0u; // destination column; the source offset is always 3 * x

#ifndef ANDROID
        while (x < limit32)
        {
            const u8 * chunk = in + 3 * x;
            internal::prefetch(chunk);

            uint8x16x3_t pixels = vld3q_u8(chunk);
            vst1q_u8(out + x, pixels.val[coi]);

            pixels = vld3q_u8(chunk + 48);
            vst1q_u8(out + x + 16, pixels.val[coi]);

            x += 32;
        }
#endif

        while (x < limit8)
        {
            uint8x8x3_t pixels = vld3_u8(in + 3 * x);
            vst1_u8(out + x, pixels.val[coi]);
            x += 8;
        }

        // Scalar tail.
        for (; x < width; ++x)
            out[x] = in[3 * x + coi];
    }
#else
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)coi;
#endif
}
/*
 * Copies channel `coi` of a 4-channel interleaved 8-bit image into a
 * single-channel image: dst[x] = src[4 * x + coi] for every row.
 * coi is used directly as an index and is not validated here; it must be 0..3.
 * Without CAROTENE_NEON the function body is empty apart from the
 * configuration assertion.
 */
void extract4(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              u32 coi)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
#ifndef ANDROID
    // Upper bound for the 32-pixel step (x + 32 <= width).
    const size_t limit32 = width >= 31 ? width - 31 : 0;
#endif
    // Upper bound for the 8-pixel step (x + 8 <= width).
    const size_t limit8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * in = internal::getRowPtr(srcBase, srcStride, row);
        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t x = 0u; // destination column; the source offset is always 4 * x

#ifndef ANDROID
        while (x < limit32)
        {
            const u8 * chunk = in + 4 * x;
            internal::prefetch(chunk);

            uint8x16x4_t pixels = vld4q_u8(chunk);
            vst1q_u8(out + x, pixels.val[coi]);

            pixels = vld4q_u8(chunk + 64);
            vst1q_u8(out + x + 16, pixels.val[coi]);

            x += 32;
        }
#endif

        while (x < limit8)
        {
            uint8x8x4_t pixels = vld4_u8(in + 4 * x);
            vst1_u8(out + x, pixels.val[coi]);
            x += 8;
        }

        // Scalar tail.
        for (; x < width; ++x)
            out[x] = in[4 * x + coi];
    }
#else
    (void)size;
    (void)srcBase; (void)srcStride;
    (void)dstBase; (void)dstStride;
    (void)coi;
#endif
}
// FILL_LINESn(macro, type) expands macro##_LINE(type, k) once per channel
// k = 0 .. n-1, in ascending order. Each level reuses the previous one.
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
// One destination plane in a split##n parameter list (leading comma included):
// a base pointer and a stride for channel n.
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride
#ifdef CAROTENE_NEON
// Per-channel building blocks for the SPLIT* function bodies. They rely on the
// local names used there: i (row), sj / dj (source / destination index),
// src (source row) and v_src (de-interleaved vector).

// Declares dst<n>, the row pointer of destination plane n for row i.
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
// Stores channel n of a 128-bit de-interleaved load to destination plane n.
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
// Stores channel n of a 64-bit de-interleaved load to destination plane n.
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
// Scalar copy of channel n of one pixel.
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];
// Multiplication by the channel count without a multiply operator.
// The argument is not parenthesised, so pass a simple expression only.
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
// Leading part of the "process the whole image as one row" condition: all
// destination strides equal the source stride. Each ends with && so the user
// appends a final comparison.
// NOTE(review): this compares the stride of the interleaved source with the
// strides of the single-channel destinations - confirm this is the intended
// contiguity test.
#define CONTDST2 srcStride == dst0Stride && \
srcStride == dst1Stride &&
#define CONTDST3 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride &&
#define CONTDST4 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride && \
srcStride == dst3Stride &&
// SPLIT_QUAD(sgn, bits, n): de-interleaves one 128-bit step of an n-channel
// row: for each channel it reads 16 / sizeof(sgn##bits) elements from
// src + sj and stores them to dst<k> + dj.
//
// For GCC 4.x older than 4.7 the step is written as inline assembly; other
// compilers use the vld<n>q / vst1q intrinsics.
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
// Each SPLIT_ASMn performs two 64-bit de-interleaving loads (first and second
// half of the step) into the low and high halves of q registers, then stores
// one q register per destination plane.
// The "memory" clobber is required because the asm reads and writes memory
// through pointer operands that are passed as plain register inputs; without
// it the compiler may reorder or cache surrounding memory accesses.
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
    "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
    "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
    : \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","memory" \
    );
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
    "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
    "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
    "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
    : \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5","memory" \
    );
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
    "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
    "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
    "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
    "vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
    : \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
    );
#define SPLIT_QUAD(sgn, bits, n) { \
    internal::prefetch(src + sj); \
    SPLIT_ASM##n(sgn, bits) \
    }
#else
#define SPLIT_QUAD(sgn, bits, n) { \
    internal::prefetch(src + sj); \
    vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
    FILL_LINES##n(VST1Q, sgn##bits) \
    }
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
// SPLIT(sgn, bits, n) defines
//   void split<n>(const Size2D&, const T* srcBase, ptrdiff_t srcStride,
//                 T* dst0Base, ptrdiff_t dst0Stride, ... one pair per channel)
// with T = sgn##bits. It de-interleaves an n-channel image into n planes:
// dst<k>[x] = src[n * x + k].
//
// Per row: 128-bit steps (SPLIT_QUAD) while a full step fits, then at most one
// 64-bit step, then a scalar loop for the remaining elements.
// When the CONTDST<n> condition and dst0Stride == size.width hold, the image
// is processed as a single row of width * height elements.
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##bits) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
SPLIT_QUAD(sgn, bits, n) \
\
if (dj < roiw8) \
{ \
vec64 v_src = vld##n##_##sgn##bits(src + sj); \
FILL_LINES##n(VST1, sgn##bits) \
sj += MUL##n(8)/sizeof(sgn##bits); \
dj += 8/sizeof(sgn##bits); \
} \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
FILL_LINES##n(SST, sgn##bits) \
} \
} \
}
// SPLIT64(sgn, n) defines split<n> for 64-bit elements (T = sgn##64).
// It de-interleaves an n-channel image into n planes, one pixel per iteration:
// a 64-bit vld<n> loads the n channels of a pixel and each is stored with vst1.
// When the CONTDST<n> condition and dst0Stride == size.width hold, the image
// is processed as a single row of width * height elements.
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
const sgn##64 * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##64) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##64) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
vec64 v_src = vld##n##_##sgn##64(src + sj); \
FILL_LINES##n(VST1, sgn##64) \
} \
} \
}
// ALPHA_QUAD(sgn, bits): one 128-bit step of the 4-channel -> (3-channel +
// 1-channel) split. It reads 16 / sizeof(sgn##bits) four-channel pixels from
// src + sj, stores their first three channels interleaved to dst3 + d3j and
// the fourth channel to dst1 + d1j.
//
// For GCC 4.x older than 4.7 the step is written as inline assembly; other
// compilers use intrinsics.
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
// The "memory" clobber is required because the asm reads and writes memory
// through pointer operands that are passed as plain register inputs; without
// it the compiler may reorder or cache surrounding memory accesses.
#define ALPHA_QUAD(sgn, bits) { \
    internal::prefetch(src + sj); \
    __asm__ ( \
        "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
        "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
        "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
        "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
        "vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
        : \
        : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
          [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
        : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
    ); \
    }
#else
// The union lets the first three vectors of the 4-vector load be passed to
// vst3q as a 3-vector value without copying.
#define ALPHA_QUAD(sgn, bits) { \
    internal::prefetch(src + sj); \
    union { vec128_4 v4; vec128_3 v3; } vals; \
    vals.v4 = vld4q_##sgn##bits(src + sj); \
    vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
    vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
    }
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
// SPLIT4ALPHA(sgn, bits) defines
//   void split4(const Size2D&, const T* srcBase, ptrdiff_t srcStride,
//               T* dst3Base, ptrdiff_t dst3Stride,
//               T* dst1Base, ptrdiff_t dst1Stride)
// with T = sgn##bits. It splits a 4-channel image into a 3-channel interleaved
// image (channels 0..2) and a single-channel image (channel 3).
//
// Per row: 128-bit steps (ALPHA_QUAD) while a full step fits, then at most one
// 64-bit step, then a scalar loop for the remaining pixels.
// When all three strides are equal to size.width, the image is processed as a
// single row of width * height pixels.
// The 64-bit step uses the intrinsics of the element type (sgn##bits), like
// the loads and the 128-bit step do, so the macro works for any element type
// rather than only for u8.
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride, \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (srcStride == dst3Stride && \
        srcStride == dst1Stride && \
        srcStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
        size_t sj = 0u, d3j = 0u, d1j = 0u; \
        \
        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
                             d1j += 16/sizeof(sgn##bits)) \
            ALPHA_QUAD(sgn, bits) \
        \
        if (d1j < roiw8) \
        { \
            union { vec64_4 v4; vec64_3 v3; } vals; \
            vals.v4 = vld4_##sgn##bits(src + sj); \
            vst3_##sgn##bits(dst3 + d3j, vals.v3); \
            vst1_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
            sj += MUL4(8)/sizeof(sgn##bits); \
            d3j += MUL3(8)/sizeof(sgn##bits); \
            d1j += 8/sizeof(sgn##bits); \
        } \
        \
        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
        { \
            dst3[d3j+0] = src[sj + 0]; \
            dst3[d3j+1] = src[sj + 1]; \
            dst3[d3j+2] = src[sj + 2]; \
            dst1[d1j] = src[sj + 3]; \
        } \
    } \
}
#else
// Builds without CAROTENE_NEON: the split functions keep their signatures but
// only run the configuration assertion; every parameter is marked as unused.

// Marks the base pointer and stride of destination plane n as unused.
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;
// Stub definition of split<n> for element type sgn##bits.
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
FILL_LINES##n(VOID, sgn##bits) \
}
// The 64-bit variant needs no special handling in the stub.
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
// Stub definition of the 4-channel -> (3-channel + 1-channel) split4 overload.
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
(void)dst3Base; \
(void)dst3Stride; \
(void)dst1Base; \
(void)dst1Stride; \
}
#endif //CAROTENE_NEON
// Function definitions generated from the macros above:
// split2 / split3 / split4 for u8, u16, s32 and s64 elements, plus the u8
// split4 overload that writes a 3-channel and a 1-channel destination.
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)
SPLIT4ALPHA(u,8)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,389 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
// FILL_LINESn(macro, type) expands macro##_LINE(type, k) once per channel
// k = 0 .. n-1, in ascending order. Each level reuses the previous one.
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
// One source plane in a combine##n parameter list (leading comma included):
// a const base pointer and a stride for channel n.
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
#ifdef CAROTENE_NEON
// Per-channel building blocks for the COMBINE* function bodies. They rely on
// the local names used there: i (row), sj / dj (source / destination index),
// dst (destination row) and v_dst (vector about to be stored interleaved).

// Declares src<n>, the row pointer of source plane n for row i.
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
// Prefetch hint for source plane n at the current position.
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
// Loads 128 bits from source plane n into channel n of v_dst.
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
// PREF_LINE followed by VLD1Q_LINE for the same plane.
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
// Loads 64 bits from source plane n into channel n of v_dst.
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
// Scalar copy of channel n of one pixel.
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
// Multiplication by the channel count without a multiply operator.
// The argument is not parenthesised, so pass a simple expression only.
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
// Leading part of the "process the whole image as one row" condition: all
// source strides equal the destination stride. Each ends with && so the user
// appends a final comparison.
// NOTE(review): this compares the stride of the interleaved destination with
// the strides of the single-channel sources - confirm this is the intended
// contiguity test.
#define CONTSRC2 dstStride == src0Stride && \
dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride && \
dstStride == src3Stride &&
// MERGE_QUAD(sgn, bits, n): interleaves one 128-bit step of n source planes:
// it reads 16 / sizeof(sgn##bits) elements from each src<k> + sj and stores
// them interleaved to dst + dj.
//
// For GCC 4.x older than 4.7 the step is written as inline assembly; other
// compilers use the vld1q / vst<n>q intrinsics.
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
// Each MERGE_ASMn loads one q register per source plane, then performs two
// 64-bit interleaving stores (low halves, then high halves).
// The "memory" clobber is required because the asm reads and writes memory
// through pointer operands that are passed as plain register inputs; without
// it the compiler may reorder or cache surrounding memory accesses.
#define MERGE_ASM2(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
    "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","memory" \
    );
#define MERGE_ASM3(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
    "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
    "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5","memory" \
    );
#define MERGE_ASM4(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
    "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
    "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
    "vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
    );
#define MERGE_QUAD(sgn, bits, n) { \
    FILL_LINES##n(PREF, sgn##bits) \
    MERGE_ASM##n(sgn, bits) \
    }
#else
// Prefetch and load are issued per plane (PRLD), then one interleaving store.
#define MERGE_QUAD(sgn, bits, n) { \
    vec128 v_dst; \
    FILL_LINES##n(PRLD, sgn##bits) \
    vst##n##q_##sgn##bits(dst + dj, v_dst); \
    }
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
// CONT_LINE(type, k) tests that source plane k is densely packed: its row
// stride (a byte distance, see internal::getRowPtr) equals the byte size of one
// row of size.width elements. The expansion ends with "&&" so that
// FILL_LINESn(CONT, type) can be followed directly by the destination test.
#define CONT_LINE(type, n) src##n##Stride == (ptrdiff_t)(size.width * sizeof(type)) &&
// COMBINE(sgn, bits, n) defines
//   void combine<n>(size, src0Base, src0Stride, ..., dstBase, dstStride)
// which interleaves n single-channel planes of element type <sgn><bits> into one
// n-channel destination: dst[n * x + k] = src<k>[x] for every row.
//
// size.width counts source elements per row; a destination row therefore holds
// n * size.width elements. When every source plane and the destination are
// densely packed, the image is processed as a single long row.
//
// Each row is handled in three stages: 128-bit vectors (MERGE_QUAD), at most
// one 64-bit vector, then a scalar loop for the remaining elements.
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
                                            FILL_LINES##n(FARG, sgn##bits), \
                                            sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (FILL_LINES##n(CONT, sgn##bits) \
        dstStride == (ptrdiff_t)(size.width * sizeof(sgn##bits) * n)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        FILL_LINES##n(VROW, sgn##bits) \
        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
        size_t sj = 0u, dj = 0u; \
        \
        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
            MERGE_QUAD(sgn, bits, n) \
        \
        if ( sj < roiw8 ) \
        { \
            vec64 v_dst; \
            FILL_LINES##n(VLD1, sgn##bits) \
            vst##n##_##sgn##bits(dst + dj, v_dst); \
            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
        } \
        \
        for (; sj < size.width; ++sj, dj += n) \
        { \
            FILL_LINES##n(SLD, sgn##bits) \
        } \
    } \
}
// COMBINE64(sgn, n) is the 64-bit element variant of COMBINE. A 64-bit vector
// holds exactly one element, so every element is moved with one vld1 per plane
// and a single vst<n>; there is no 128-bit stage and no scalar tail.
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
                                         FILL_LINES##n(FARG, sgn##64), \
                                         sgn##64 * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (FILL_LINES##n(CONT, sgn##64) \
        dstStride == (ptrdiff_t)(size.width * sizeof(sgn##64) * n)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        FILL_LINES##n(VROW, sgn##64) \
        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
        size_t sj = 0u, dj = 0u; \
        \
        for (; sj < size.width; ++sj, dj += n) \
        { \
            vec64 v_dst; \
            FILL_LINES##n(VLD1, sgn##64) \
            vst##n##_##sgn##64(dst + dj, v_dst); \
        } \
    } \
}
#else
// Build without NEON: combine<n>() keeps its signature, but the body only calls
// internal::assertSupportedConfiguration() and marks every parameter as used.
// VOID_LINE(type, k) silences unused-parameter warnings for source plane k.
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
FILL_LINES##n(VOID, sgn##bits) \
(void)dstBase; \
(void)dstStride; \
}
// The stub does not depend on the element width, so the 64-bit variant simply
// forwards to COMBINE.
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
#endif //CAROTENE_NEON
// Generate the combine2 / combine3 / combine4 overloads for u8, u16, s32 and
// s64 elements.
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
// Packs separate Y, U and V planes into an interleaved Y0 U Y1 V byte stream.
//
// size.width counts chroma samples per row: for every U/V sample two luma
// samples are consumed and four output bytes are produced. Rows are addressed
// through internal::getRowPtr with the given strides.
// Without NEON the function only runs the configuration assertion.
void combineYUYV(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
    // Exclusive upper bounds for the chroma index in the vector loops.
    const size_t limit8 = width >= 7 ? width - 7 : 0;
#ifndef ANDROID
    const size_t limit32 = width >= 31 ? width - 31 : 0;
#endif

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * lumaRow = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * uRow = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * vRow = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);

        // c is the chroma index; the luma index is 2 * c, the output index 4 * c.
        size_t c = 0u;

#ifndef ANDROID
        // 32 chroma samples per iteration, processed as two 16-sample halves.
        for (; c < limit32; c += 32)
        {
            const u8 * y = lumaRow + 2 * c;
            const u8 * u = uRow + c;
            const u8 * v = vRow + c;
            u8 * out = outRow + 4 * c;

            internal::prefetch(y);
            internal::prefetch(u);
            internal::prefetch(v);

            // vld2q splits the luma bytes into even (Y0) and odd (Y1) samples.
            uint8x16x2_t lumaPair = vld2q_u8(y);
            uint8x16x4_t packed;
            packed.val[0] = lumaPair.val[0];
            packed.val[1] = vld1q_u8(u);
            packed.val[2] = lumaPair.val[1];
            packed.val[3] = vld1q_u8(v);
            vst4q_u8(out, packed);

            lumaPair = vld2q_u8(y + 32);
            packed.val[0] = lumaPair.val[0];
            packed.val[1] = vld1q_u8(u + 16);
            packed.val[2] = lumaPair.val[1];
            packed.val[3] = vld1q_u8(v + 16);
            vst4q_u8(out + 64, packed);
        }
#endif

        // 8 chroma samples per iteration.
        for (; c < limit8; c += 8)
        {
            const uint8x8x2_t lumaPair = vld2_u8(lumaRow + 2 * c);
            uint8x8x4_t packed;
            packed.val[0] = lumaPair.val[0];
            packed.val[1] = vld1_u8(uRow + c);
            packed.val[2] = lumaPair.val[1];
            packed.val[3] = vld1_u8(vRow + c);
            vst4_u8(outRow + 4 * c, packed);
        }

        // Scalar tail.
        for (; c < width; ++c)
        {
            u8 * out = outRow + 4 * c;
            out[0] = lumaRow[2 * c];
            out[1] = uRow[c];
            out[2] = lumaRow[2 * c + 1];
            out[3] = vRow[c];
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Packs separate Y, U and V planes into an interleaved U Y0 V Y1 byte stream.
//
// size.width counts chroma samples per row: for every U/V sample two luma
// samples are consumed and four output bytes are produced. Rows are addressed
// through internal::getRowPtr with the given strides.
// Without NEON the function only runs the configuration assertion.
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const size_t width = size.width;
    // Exclusive upper bounds for the chroma index in the vector loops.
    const size_t limit8 = width >= 7 ? width - 7 : 0;
#ifndef ANDROID
    const size_t limit32 = width >= 31 ? width - 31 : 0;
#endif

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * lumaRow = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * uRow = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * vRow = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);

        // c is the chroma index; the luma index is 2 * c, the output index 4 * c.
        size_t c = 0u;

#ifndef ANDROID
        // 32 chroma samples per iteration, processed as two 16-sample halves.
        for (; c < limit32; c += 32)
        {
            const u8 * y = lumaRow + 2 * c;
            const u8 * u = uRow + c;
            const u8 * v = vRow + c;
            u8 * out = outRow + 4 * c;

            internal::prefetch(y);
            internal::prefetch(u);
            internal::prefetch(v);

            // vld2q splits the luma bytes into even (Y0) and odd (Y1) samples.
            uint8x16x2_t lumaPair = vld2q_u8(y);
            uint8x16x4_t packed;
            packed.val[0] = vld1q_u8(u);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1q_u8(v);
            packed.val[3] = lumaPair.val[1];
            vst4q_u8(out, packed);

            lumaPair = vld2q_u8(y + 32);
            packed.val[0] = vld1q_u8(u + 16);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1q_u8(v + 16);
            packed.val[3] = lumaPair.val[1];
            vst4q_u8(out + 64, packed);
        }
#endif

        // 8 chroma samples per iteration.
        for (; c < limit8; c += 8)
        {
            const uint8x8x2_t lumaPair = vld2_u8(lumaRow + 2 * c);
            uint8x8x4_t packed;
            packed.val[0] = vld1_u8(uRow + c);
            packed.val[1] = lumaPair.val[0];
            packed.val[2] = vld1_u8(vRow + c);
            packed.val[3] = lumaPair.val[1];
            vst4_u8(outRow + 4 * c, packed);
        }

        // Scalar tail.
        for (; c < width; ++c)
        {
            u8 * out = outRow + 4 * c;
            out[0] = uRow[c];
            out[1] = lumaRow[2 * c];
            out[2] = vRow[c];
            out[3] = lumaRow[2 * c + 1];
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

340
3rdparty/carotene/src/cmp.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// vnst stores two comparison masks as bytes at dst, narrowing each lane to
// 8 bits first when the lanes are wider than a byte.
// 8-bit lanes: both vectors are written back to back (32 bytes).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
{
    vst1q_u8(dst, v1);
    vst1q_u8(dst + 16, v2);
}
// 16-bit lanes: each vector is narrowed to 8 bytes, 16 bytes are written.
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
{
    const uint8x8_t lowHalf = vmovn_u16(v1);
    const uint8x8_t highHalf = vmovn_u16(v2);
    vst1q_u8(dst, vcombine_u8(lowHalf, highHalf));
}
// 32-bit lanes: narrowed twice (32 -> 16 -> 8 bits), 8 bytes are written.
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
{
    const uint16x8_t halfWidth = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
    vst1_u8(dst, vmovn_u16(halfWidth));
}
// Vectorized tail step run between the main 32-byte loop and the scalar loop.
// The primary template is a no-op: it is selected for element sizes without a
// dedicated specialization, where the leftover data is too short for another
// vector step.
template <typename Op, int elsize> struct vtail
{
    static inline void compare(const typename Op::type * /*src0*/, const typename Op::type * /*src1*/,
                               u8 * /*dst*/, const Op & /*op*/,
                               size_t & /*x*/, size_t /*width*/)
    {
        // nothing to do
    }
};
template <typename Op> struct vtail<Op, 2>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if( x + 8 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, internal::vmovn(v_dst));
x+=8;
}
}
};
template <typename Op> struct vtail<Op, 1>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
typedef typename internal::VecTraits<type>::vec64 vec64;
typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if( x + 16 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1q(dst + x, v_dst);
x+=16;
}
if( x + 8 < width)
{
vec64 v_src0, v_src1;
uvec64 v_dst;
v_src0 = internal::vld1(src0 + x);
v_src1 = internal::vld1(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, v_dst);
x+=8;
}
}
};
// Element-wise comparison driver.
//
// Applies `op` to every pair (src0[y][x], src1[y][x]) and writes one result
// byte per element to dst[y][x]. `op` must provide overloads for 128-bit
// vectors and for scalar pointers; the vtail specializations may additionally
// use a 64-bit vector overload.
//
// size      - image size in elements (taken by value because it is modified
//             locally when the rows can be merged).
// *Stride   - byte distance between consecutive rows of the respective buffer.
//
// Each row is processed in three stages: 32 bytes of source data per iteration
// (two 128-bit vectors per input), an optional shorter vector step (vtail), and
// a scalar loop for the remaining elements.
template <typename Op>
void vcompare(Size2D size,
              const typename Op::type * src0Base, ptrdiff_t src0Stride,
              const typename Op::type * src1Base, ptrdiff_t src1Stride,
              u8 * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type type;
    typedef typename internal::VecTraits<type>::vec128 vec128;
    typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;

    // Treat the image as one long row only when all three buffers are densely
    // packed. The destination holds one byte per element, so its packed stride
    // is size.width bytes, not size.width * sizeof(type).
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(type)) &&
        dstStride == (ptrdiff_t)size.width)
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Number of elements covered by two 128-bit vectors.
    const u32 step_base = 32 / sizeof(type);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;

    for (size_t y = 0; y < size.height; ++y)
    {
        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
        size_t x = 0;

        for( ; x < roiw_base; x += step_base )
        {
            internal::prefetch(src0 + x);
            internal::prefetch(src1 + x);

            vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
            vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
            uvec128 v_dst0;
            uvec128 v_dst1;

            op(v_src00, v_src10, v_dst0);
            op(v_src01, v_src11, v_dst1);

            // vnst narrows the masks to bytes where needed and stores them.
            vnst(dst + x, v_dst0, v_dst1);
        }

        vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);

        for (; x < size.width; ++x)
        {
            op(src0 + x, src1 + x, dst + x);
        }
    }
}
template<typename T>
struct OpCmpEQ
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vceqq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vceq(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpNE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 0 : 255;
}
};
template<typename T>
struct OpCmpGT
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgtq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcgt(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] > src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpGE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgeq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcge(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] >= src1[0] ? 255 : 0;
}
};
}
// IMPL_CMPOP(op, type) defines the public entry point
//   void cmp<op>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride)
// for one element type. NEON build: forwards to vcompare with an OpCmp<op><type>
// functor after checking the configuration.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
vcompare(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
OpCmp##op<type>()); \
}
#else
// Build without NEON: same signature, but the body only calls
// internal::assertSupportedConfiguration() and marks the parameters as used.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)src0Base; \
(void)src0Stride; \
(void)src1Base; \
(void)src1Stride; \
(void)dstBase; \
(void)dstStride; \
}
#endif
// Generate cmpEQ / cmpNE / cmpGT / cmpGE overloads for every supported element
// type (u8, s8, u16, s16, u32, s32, f32).
IMPL_CMPOP(EQ, u8)
IMPL_CMPOP(EQ, s8)
IMPL_CMPOP(EQ, u16)
IMPL_CMPOP(EQ, s16)
IMPL_CMPOP(EQ, u32)
IMPL_CMPOP(EQ, s32)
IMPL_CMPOP(EQ, f32)
IMPL_CMPOP(NE, u8)
IMPL_CMPOP(NE, s8)
IMPL_CMPOP(NE, u16)
IMPL_CMPOP(NE, s16)
IMPL_CMPOP(NE, u32)
IMPL_CMPOP(NE, s32)
IMPL_CMPOP(NE, f32)
IMPL_CMPOP(GT, u8)
IMPL_CMPOP(GT, s8)
IMPL_CMPOP(GT, u16)
IMPL_CMPOP(GT, s16)
IMPL_CMPOP(GT, u32)
IMPL_CMPOP(GT, s32)
IMPL_CMPOP(GT, f32)
IMPL_CMPOP(GE, u8)
IMPL_CMPOP(GE, s8)
IMPL_CMPOP(GE, u16)
IMPL_CMPOP(GE, s16)
IMPL_CMPOP(GE, u32)
IMPL_CMPOP(GE, s32)
IMPL_CMPOP(GE, f32)
} // namespace CAROTENE_NS

2846
3rdparty/carotene/src/colorconvert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

108
3rdparty/carotene/src/common.cpp vendored Normal file
View File

@ -0,0 +1,108 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cstdlib>
#include <iostream>
#include "common.hpp"
namespace CAROTENE_NS {
// Reports whether the library was compiled with NEON support, i.e. whether
// CAROTENE_NEON was defined at build time.
bool isSupportedConfiguration()
{
#ifdef CAROTENE_NEON
    const bool builtWithNeon = true;
#else
    const bool builtWithNeon = false;
#endif
    return builtWithNeon;
}
namespace internal {
// Terminates the process when a function cannot run: either the build has no
// NEON support (checked first) or the caller reports unsupported parameters.
// The reason is written to std::cerr before std::abort() is called.
void assertSupportedConfiguration(bool parametersSupported)
{
    const char * reason = 0;

    if (!isSupportedConfiguration())
        reason = "internal error: attempted to use an unavailable function";
    else if (!parametersSupported)
        reason = "internal error: attempted to use a function with unsupported parameters";

    if (reason)
    {
        std::cerr << reason << std::endl;
        std::abort();
    }
}
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
{
ptrdiff_t p = _p + (ptrdiff_t)startMargin;
size_t len = _len + startMargin + endMargin;
if( (size_t)p < len )
return _p;
else if( borderType == BORDER_MODE_REPLICATE )
p = p < 0 ? 0 : (ptrdiff_t)len - 1;
else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
{
s32 delta = borderType == BORDER_MODE_REFLECT101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta;
}
while( (size_t)p >= len );
}
else if( borderType == BORDER_MODE_WRAP )
{
if( p < 0 )
p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len;
if( p >= (ptrdiff_t)len )
p %= (ptrdiff_t)len;
}
else if( borderType == BORDER_MODE_CONSTANT )
p = -1;
else
internal::assertSupportedConfiguration(false);
return p - (ptrdiff_t)startMargin;
}
} // namespace internal
} // namespace CAROTENE_NS

97
3rdparty/carotene/src/common.hpp vendored Normal file
View File

@ -0,0 +1,97 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_COMMON_HPP
#define CAROTENE_SRC_COMMON_HPP
#include <cstddef>
#include <cstdlib>
#include <algorithm>
// CAROTENE_NEON selects the NEON implementations. It is defined only when the
// build requests NEON (WITH_NEON) and the compiler reports NEON support through
// __ARM_NEON__ or __ARM_NEON.
#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
#define CAROTENE_NEON
#endif
#ifdef CAROTENE_NEON
#include <arm_neon.h>
#include "intrinsics.hpp"
#endif
#include <carotene/functions.hpp>
#include "saturate_cast.hpp"
namespace CAROTENE_NS { namespace internal {
// Issues a prefetch for the address `offset` bytes past ptr (320 bytes by
// default). Uses the GCC builtin or, for MSVC NEON builds, __prefetch; on any
// other toolchain the call does nothing.
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined(__GNUC__)
    __builtin_prefetch(static_cast<const char*>(ptr) + offset);
#elif defined(_MSC_VER) && defined(CAROTENE_NEON)
    __prefetch(static_cast<const char*>(ptr) + offset);
#else
    (void)ptr;
    (void)offset;
#endif
}
// Returns the pointer to row `row` of an image that starts at `base`.
// `stride` is the distance between consecutive rows in bytes (it may be
// negative); the constness of T is preserved.
template <typename T>
inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
{
    const ptrdiff_t byteOffset = ptrdiff_t(row) * stride;
    const char *rowStart = reinterpret_cast<const char *>(base) + byteOffset;
    return reinterpret_cast<T *>(const_cast<char *>(rowStart));
}
// Aborts the process (after printing a message to std::cerr) when the library
// was built without NEON support or when parametersSupported is false.
void assertSupportedConfiguration(bool parametersSupported = true);
// Maps coordinate _p onto the range [-startMargin, _len + endMargin) according
// to borderType; coordinates already inside that range are returned unchanged.
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
/*!
 * Rounds a pointer up to a multiple of n bytes.
 *
 * The pointer is either returned unchanged (already aligned) or moved forward
 * by less than n bytes. The alignment is computed with a bit mask, which
 * assumes that n is a power of two; n defaults to sizeof(T).
 */
template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
{
    const size_t mask = ~(n - 1);
    const size_t bumped = (size_t)ptr + (n - 1);
    return (T*)(bumped & mask);
}
}}
#endif

1331
3rdparty/carotene/src/convert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

399
3rdparty/carotene/src/convert_depth.cpp vendored Normal file
View File

@ -0,0 +1,399 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void lshiftConst(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
}
for (; j < size.width; j++)
{
dst[j] = ((s16)src[j] << shift);
}
}
}
// Specialization for a shift of zero: a plain u8 -> s16 widening without any
// shift instruction.
template <>
void lshiftConst<0>(const Size2D &size,
                    const u8 * srcBase, ptrdiff_t srcStride,
                    s16 * dstBase, ptrdiff_t dstStride)
{
    const size_t width = size.width;
    // Exclusive upper bounds for the 16- and 8-element vector loops.
    const size_t vecEnd16 = width >= 15 ? width - 15 : 0;
    const size_t vecEnd8 = width >= 7 ? width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * in = internal::getRowPtr(srcBase, srcStride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        while (col < vecEnd16)
        {
            internal::prefetch(in + col);
            const uint8x16_t pixels = vld1q_u8(in + col);
            const int16x8_t low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            const int16x8_t high = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
            vst1q_s16(out + col, low);
            vst1q_s16(out + col + 8, high);
            col += 16;
        }

        while (col < vecEnd8)
        {
            const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(in + col)));
            vst1q_s16(out + col, widened);
            col += 8;
        }

        // Scalar tail.
        for (; col < width; ++col)
            out[col] = (s16)in[col];
    }
}
template <int shift>
void rshiftConst(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)((src[j] >> shift));
}
}
}
}
// Specialization for a shift of zero: narrows s16 to u8 without shifting.
//
// cpolicy == CONVERT_POLICY_SATURATE clamps the value to the u8 range; any
// other policy keeps the low 8 bits (wrap-around).
template <>
void rshiftConst<0>(const Size2D &size,
                    const s16 * srcBase, ptrdiff_t srcStride,
                    u8 * dstBase, ptrdiff_t dstStride,
                    CONVERT_POLICY cpolicy)
{
    const size_t width = size.width;
    // Exclusive upper bounds for the 16- and 8-element vector loops.
    const size_t vecEnd16 = width >= 15 ? width - 15 : 0;
    const size_t vecEnd8 = width >= 7 ? width - 7 : 0;
    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * in = internal::getRowPtr(srcBase, srcStride, row);
        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        if (!saturate)
        {
            // Wrap-around narrowing.
            for (; col < vecEnd16; col += 16)
            {
                internal::prefetch(in + col);
                const int16x8_t first = vld1q_s16(in + col);
                const int16x8_t second = vld1q_s16(in + col + 8);
                const int8x16_t narrowed = vcombine_s8(vmovn_s16(first), vmovn_s16(second));
                vst1q_u8(out + col, vreinterpretq_u8_s8(narrowed));
            }
            for (; col < vecEnd8; col += 8)
            {
                const int16x8_t values = vld1q_s16(in + col);
                vst1_u8(out + col, vreinterpret_u8_s8(vmovn_s16(values)));
            }
            for (; col < width; ++col)
                out[col] = (u8)in[col];
            continue;
        }

        // Saturating narrowing.
        for (; col < vecEnd16; col += 16)
        {
            internal::prefetch(in + col);
            const int16x8_t first = vld1q_s16(in + col);
            const int16x8_t second = vld1q_s16(in + col + 8);
            vst1q_u8(out + col, vcombine_u8(vqmovun_s16(first), vqmovun_s16(second)));
        }
        for (; col < vecEnd8; col += 8)
        {
            const int16x8_t values = vld1q_s16(in + col);
            vst1_u8(out + col, vqmovun_s16(values));
        }
        for (; col < width; ++col)
            out[col] = internal::saturate_cast<u8>(in[col]);
    }
}
// Function-pointer types matching the lshiftConst<N> / rshiftConst<N>
// instantiations, used to pick an instantiation from a run-time shift value.
typedef void (* lshiftConstFunc)(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride);
typedef void (* rshiftConstFunc)(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy);
} // namespace
#endif
// Widens a u8 image to s16 and shifts every element left by `shift` bits.
//
// A shift of 16 or more moves every bit out of a 16-bit element, so the
// destination rows are simply zero-filled in that case. Without NEON the
// function only runs the configuration assertion.
void lshift(const Size2D &size,
            const u8 * srcBase, ptrdiff_t srcStride,
            s16 * dstBase, ptrdiff_t dstStride,
            u32 shift)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (shift >= 16u)
    {
        const size_t rowBytes = sizeof(s16) * size.width;
        for (size_t row = 0; row < size.height; ++row)
            std::memset(internal::getRowPtr(dstBase, dstStride, row), 0, rowBytes);
        return;
    }

    // The NEON immediate-shift intrinsic only accepts a compile-time constant
    // (GCC otherwise reports "argument must be a constant" from arm_neon.h),
    // so the run-time shift selects one of sixteen template instantiations.
    static const lshiftConstFunc table[16] =
    {
        lshiftConst<0>,  lshiftConst<1>,  lshiftConst<2>,  lshiftConst<3>,
        lshiftConst<4>,  lshiftConst<5>,  lshiftConst<6>,  lshiftConst<7>,
        lshiftConst<8>,  lshiftConst<9>,  lshiftConst<10>, lshiftConst<11>,
        lshiftConst<12>, lshiftConst<13>, lshiftConst<14>, lshiftConst<15>
    };

    table[shift](size, srcBase, srcStride, dstBase, dstStride);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}
/*
 * Run-time entry point for the s16 -> u8 shift kernels.
 *
 * size               - image dimensions (width/height in elements)
 * srcBase/srcStride  - source image and its row stride
 * dstBase/dstStride  - destination image and its row stride
 * shift              - shift amount; values below 16 are forwarded to
 *                      rshiftConst<shift>
 * cpolicy            - conversion policy, forwarded to rshiftConst and used
 *                      here for shift >= 16
 *
 * For shift >= 16:
 *   - CONVERT_POLICY_WRAP writes 255 for negative sources and 0 otherwise;
 *   - any other policy zero-fills the destination.
 *
 * Without CAROTENE_NEON the body only silences unused-parameter warnings.
 */
void rshift(const Size2D &size,
            const s16 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            u32 shift, CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (shift < 16)
    {
        // This table is needed because the shift intrinsics require a
        // compile-time constant; passing a run-time value fails with e.g.:
        // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
        //     return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
        static const rshiftConstFunc dispatch[16] =
        {
            rshiftConst<0>,  rshiftConst<1>,  rshiftConst<2>,  rshiftConst<3>,
            rshiftConst<4>,  rshiftConst<5>,  rshiftConst<6>,  rshiftConst<7>,
            rshiftConst<8>,  rshiftConst<9>,  rshiftConst<10>, rshiftConst<11>,
            rshiftConst<12>, rshiftConst<13>, rshiftConst<14>, rshiftConst<15>
        };
        dispatch[shift](size, srcBase, srcStride, dstBase, dstStride, cpolicy);
        return;
    }

    if (cpolicy != CONVERT_POLICY_WRAP)
    {
        // Non-wrapping policies: every destination row is cleared.
        for (size_t row = 0; row < size.height; ++row)
        {
            u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
            std::memset(dstRow, 0, sizeof(u8) * size.width);
        }
        return;
    }

    // Wrapping policy: only the sign survives, so emit 0xFF for negative
    // inputs and 0x00 otherwise.
    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
    const int16x8_t v_zero = vdupq_n_s16(0);

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // 16 elements per iteration
        for (; col < vecEnd16; col += 16)
        {
            internal::prefetch(srcRow + col);
            const int16x8_t v_lo = vld1q_s16(srcRow + col);
            const int16x8_t v_hi = vld1q_s16(srcRow + col + 8);
            // the 16-bit "< 0" masks are narrowed to 8-bit masks
            const uint8x8_t v_maskLo = vmovn_u16(vcltq_s16(v_lo, v_zero));
            const uint8x8_t v_maskHi = vmovn_u16(vcltq_s16(v_hi, v_zero));
            vst1q_u8(dstRow + col, vcombine_u8(v_maskLo, v_maskHi));
        }
        // 8 elements per iteration
        for (; col < vecEnd8; col += 8)
        {
            const int16x8_t v_val = vld1q_s16(srcRow + col);
            vst1_u8(dstRow + col, vmovn_u16(vcltq_s16(v_val, v_zero)));
        }
        // scalar remainder
        for (; col < size.width; col++)
        {
            if (srcRow[col] >= 0)
                dstRow[col] = 0;
            else
                dstRow[col] = 255;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
    (void)cpolicy;
#endif
}
} // namespace CAROTENE_NS

2498
3rdparty/carotene/src/convert_scale.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

340
3rdparty/carotene/src/convolution.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
/*
 * Tells whether convolution() can handle the given arguments.
 *
 * Returns true only when all of the following hold:
 *   - isSupportedConfiguration() is true,
 *   - the image is at least 8 elements wide,
 *   - the border mode is BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE,
 *   - the kernel is exactly 3x3.
 */
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
                            BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;

    if (size.width < 8)
        return false;

    if (border != BORDER_MODE_CONSTANT && border != BORDER_MODE_REPLICATE)
        return false;

    return ksize.width == 3 && ksize.height == 3;
}
#ifdef CAROTENE_NEON
namespace {
// Wraps the immediate-operand intrinsic vshrq_n_s32 so that every shift amount
// becomes a separate function with a uniform signature; convolution() stores
// the instantiations in a table and selects one at run time.
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
return vshrq_n_s32(value, shift);
}
// A shift of 0 leaves the vector unchanged. It is specialised so that
// vshrq_n_s32 is never instantiated with an immediate of 0
// (NOTE(review): the intrinsic's accepted range is 1..32 per the Arm docs).
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
return value;
}
} // namespace
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
/*
 * 3x3 convolution of a u8 image with an s16 kernel, writing u8 results.
 *
 * size               - image dimensions
 * srcBase/srcStride  - source image and its row stride
 * dstBase/dstStride  - destination image and its row stride
 * border             - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE
 * borderValue        - value used outside the image for BORDER_MODE_CONSTANT
 * ksize              - kernel size; must be 3x3 (see isConvolutionSupported)
 * kernelBase         - 9 coefficients; read in reverse order, i.e.
 *                      kernelBase[8] weights the upper-left neighbour and
 *                      kernelBase[0] the lower-right one
 * scale              - the accumulated sum is arithmetically shifted right by
 *                      this many bits before being saturated to u8
 *
 * The supported-configuration assertion is evaluated first. Without
 * CAROTENE_NEON the body only silences unused-parameter warnings.
 */
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    // One entry per shift amount, because vshrq_n_s32 needs an immediate.
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };

    // The table has 33 entries, so an unchecked 'scale' above 32 would index
    // past its end. Shifting a 32-bit value right by 31 or more leaves only
    // the sign (0 or -1), so clamping does not change any result.
    const u32 vecShift = scale > 32u ? 32u : scale;
    // A scalar shift by >= 32 is undefined for a 32-bit operand; 31 yields
    // the same sign-only result.
    const u32 scalarShift = scale > 31u ? 31u : scale;

    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[vecShift];

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Rows above/below the image: NULL for the constant border (the border
        // value is substituted later), clamped to the edge row otherwise.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // On the last two rows the vector loop stops one step earlier.
        // NOTE(review): presumably to avoid 8-byte loads running past the end
        // of the source buffer - confirm.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        // Each iteration loads 8 columns at x but produces the output for the
        // previous 8 columns (x - 8), since the right neighbour is needed.
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }

                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            // row above: coefficients 8, 7, 6
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            // current row: coefficients 5, 4, 3
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            // row below: coefficients 2, 1, 0
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }

            // make scale
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // narrow s32 -> u16 -> u8 with saturation and store
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // Resume at the first column the vector loop did not write.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    // srow0/srow2 are never NULL in replicate mode
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scalarShift);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS

430
3rdparty/carotene/src/count_nonzero.cpp vendored Normal file
View File

@ -0,0 +1,430 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
/*
 * Counts the non-zero elements of a u8 image.
 *
 * _size              - image dimensions
 * srcBase/srcStride  - source image and its row stride
 *
 * Returns the count, or 0x7fFFffFF once the running total no longer fits a
 * non-negative s32. Returns 0 when built without CAROTENE_NEON.
 */
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the image is handled as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    const size_t vecWidth = size.width & ~15u;
    const uint8x16_t v_one = vmovq_n_u8(1);
    s32 total = 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // 255 vector iterations at most per block, so that the 8-bit
        // per-lane counters cannot wrap before they are widened.
#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
        while (col < vecWidth)
        {
            const size_t lastStart = std::min(col + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
            uint8x16_t v_acc = vmovq_n_u8(0);

            for (; col <= lastStart; col += 16)
            {
                internal::prefetch(src + col);
                // min(x, 1) maps every non-zero byte to 1
                v_acc = vaddq_u8(v_acc, vminq_u8(vld1q_u8(src + col), v_one));
            }

            // widen 16 x u8 -> 4 x u32, then fold to 2 lanes
            const uint32x4_t v_sum4 = vpaddlq_u16(vpaddlq_u8(v_acc));
            const uint32x2_t v_sum2 = vadd_u32(vget_low_u32(v_sum4), vget_high_u32(v_sum4));

            s32 partial[2];
            vst1_u32((u32*)partial, v_sum2);

            // saturate in case of overflow ~ 2GB of non-zeros...
            if (partial[0] < 0 || partial[1] < 0)
                return 0x7fFFffFF;

            partial[0] += partial[1];
            total += partial[0];
            if (partial[0] < 0 || total < 0)
                return 0x7fFFffFF;
        }

        // scalar remainder of the row
        for (; col < size.width; col++)
        {
            if (src[col] != 0)
                total += 1;
        }

        // saturate in case of overflow ~ 2GB of non-zeros...
        if (total < 0)
            return 0x7fFFffFF;
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of a u16 image.
 *
 * _size              - image dimensions
 * srcBase/srcStride  - source image and its row stride
 *
 * Returns the count, or 0x7fFFffFF once the running total no longer fits a
 * non-negative s32. Returns 0 when built without CAROTENE_NEON.
 */
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the image is handled as one long row.
    // NOTE(review): if srcStride is expressed in bytes, this comparison does
    // not account for sizeof(u16) - confirm against getRowPtr().
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    const size_t vecWidth = size.width & ~7u;
    const uint16x8_t v_one = vmovq_n_u16(1);
    s32 total = 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // 65535 vector iterations at most per block, so that the 16-bit
        // per-lane counters cannot wrap before they are widened.
#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
        while (col < vecWidth)
        {
            const size_t lastStart = std::min(col + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
            uint16x8_t v_acc = vmovq_n_u16(0);

            for (; col <= lastStart; col += 8)
            {
                internal::prefetch(src + col);
                // min(x, 1) maps every non-zero element to 1
                v_acc = vaddq_u16(v_acc, vminq_u16(vld1q_u16(src + col), v_one));
            }

            // widen 8 x u16 -> 4 x u32, then fold to 2 lanes
            const uint32x4_t v_sum4 = vpaddlq_u16(v_acc);
            const uint32x2_t v_sum2 = vadd_u32(vget_low_u32(v_sum4), vget_high_u32(v_sum4));

            s32 partial[2];
            vst1_u32((u32*)partial, v_sum2);

            // saturate in case of overflow ~ 4GB of non-zeros...
            if (partial[0] < 0 || partial[1] < 0)
                return 0x7fFFffFF;

            partial[0] += partial[1];
            total += partial[0];
            if (partial[0] < 0 || total < 0)
                return 0x7fFFffFF;
        }

        // scalar remainder of the row
        for (; col < size.width; col++)
        {
            if (src[col] != 0)
                total += 1;
        }

        // saturate in case of overflow ~ 4GB of non-zeros...
        if (total < 0)
            return 0x7fFFffFF;
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an s32 image.
 *
 * _size              - image dimensions
 * srcBase/srcStride  - source image and its row stride
 *
 * The data is read as u32, so only the bit pattern matters. Returns the
 * count, or 0x7fFFffFF once a partial or running total no longer fits a
 * non-negative s32. Returns 0 when built without CAROTENE_NEON.
 */
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the image is handled as one long row.
    // NOTE(review): if srcStride is expressed in bytes, this comparison does
    // not account for sizeof(s32) - confirm against getRowPtr().
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
        // size_t (not u32) so that the index has the same width as the bound
        // size.width; a narrower index would wrap around and the tail loop
        // below would never terminate for very wide rows.
        size_t i = 0;

        uint32x4_t vc1 = vmovq_n_u32(1);
        uint32x4_t vs = vmovq_n_u32(0);

        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            // min(x, 1) maps every non-zero element to 1
            uint32x4_t vnz = vminq_u32(vln, vc1);
            // saturating add: lanes stick at 0xFFFFFFFF instead of wrapping
            vs = vqaddq_u32(vs, vnz);
        }

        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));

        s32 s[2];
        vst1_u32((u32*)s, vs2);

        if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)
        {
            return 0x7fFFffFF;
        }

        // scalar remainder of the row
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an f32 image.
 *
 * _size              - image dimensions
 * srcBase/srcStride  - source image and its row stride
 *
 * Returns the count, or 0x7fFFffFF once a partial or running total becomes
 * negative. Returns 0 when built without CAROTENE_NEON.
 *
 * The scalar tail treats every value with magnitude below
 * numeric_limits<float>::min() (zero and denormals) as zero.
 * NOTE(review): the vector path uses a floating-point equality compare, so
 * whether it also treats denormals as zero presumably depends on the
 * target's flush-to-zero behaviour - confirm that both paths agree.
 */
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the image is handled as one long row.
    // NOTE(review): if srcStride is expressed in bytes, this comparison does
    // not account for sizeof(f32) - confirm against getRowPtr().
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    const size_t vecWidth = size.width & ~3u;
    s32 total = 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        const float32x4_t v_zero = vmovq_n_f32(0);
        int32x4_t v_acc = vmovq_n_s32(0);

        for (; col < vecWidth; col += 4)
        {
            internal::prefetch(src + col);
            const uint32x4_t v_isZero = vceqq_f32(vld1q_f32(src + col), v_zero);
            // inverted mask is -1 per non-zero lane, so the accumulator holds
            // the negated count
            v_acc = vqaddq_s32(v_acc, vreinterpretq_s32_u32(vmvnq_u32(v_isZero)));
        }

        // fold to two lanes and flip the sign back
        const int32x2_t v_folded = vqadd_s32(vget_low_s32(v_acc), vget_high_s32(v_acc));
        const int32x2_t v_count = vqneg_s32(v_folded);

        int partial[2];
        vst1_s32(partial, v_count);

        partial[0] += partial[1];
        total += partial[0];
        if (partial[0] < 0 || total < 0)//case of overflow ~ 8GB of non-zeros...
            return 0x7fFFffFF;

        // scalar remainder of the row
        for (; col < size.width; col++)
        {
            const bool nearZero = src[col] < std::numeric_limits<float>::min() &&
                                  src[col] > -std::numeric_limits<float>::min();
            if (!nearZero)
                total += 1;
        }

        if (total < 0)
            return 0x7fFFffFF;
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
/*
 * Counts the non-zero elements of an f64 image.
 *
 * _size              - image dimensions
 * srcBase/srcStride  - source image and its row stride
 *
 * An element counts as non-zero when any bit other than the sign bit is set,
 * so +0.0 and -0.0 are zero while denormals and NaNs are non-zero. The vector
 * path tests the bit pattern directly; the scalar tail uses "!= 0.0", which
 * classifies values the same way under IEEE-754 arithmetic.
 *
 * Returns the count, or 0x7fFFffFF once a partial or running total becomes
 * negative. Returns 0 when built without CAROTENE_NEON.
 */
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width the image is handled as one long row.
    // NOTE(review): if srcStride is expressed in bytes, this comparison does
    // not account for sizeof(f64) - confirm against getRowPtr().
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    size_t roiw4 = size.width & ~3u;
    size_t roiw2 = size.width & ~1u;

    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f64* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // Four independent accumulators; each lane holds a negated count
        // (-1 is added per non-zero element).
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);

        // 8 doubles per iteration
        for (; i < roiw8; i += 8 )
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));

            // drop the sign bit so that -0.0 is seen as zero
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);

            // compare each 32-bit half with zero
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);

            // pairwise max merges the two halves of each double: all-ones
            // when either half is non-zero
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }

        // one optional step of 4 doubles
        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }

        // one optional step of 2 doubles
        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));

            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }

        // merge the accumulators and flip the sign back
        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        int32x2_t vsneg = vqneg_s32(vs1);

        s32 s[2];
        vst1_s32(s, vsneg);

        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
        {
            return 0x7fFFffFF;
        }

        // Scalar remainder. Uses the same definition of "zero" as the vector
        // path above (denormals are non-zero), so an element is classified
        // the same way regardless of the column it sits in.
        for (; i < size.width; i++)
            result += (src[i] != 0.0)?1:0;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
} // namespace CAROTENE_NS

708
3rdparty/carotene/src/div.cpp vendored Normal file
View File

@ -0,0 +1,708 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
// Adds 0.5 carrying the sign of each lane to that lane (+0.5 for positive,
// -0.5 for negative inputs), so that a following truncating conversion
// rounds half away from zero.
inline float32x4_t vroundq(const float32x4_t& v)
{
    const int32x4_t v_signBit = vdupq_n_s32(1 << 31);
    const int32x4_t v_halfBits = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    // isolate the sign of every lane and stamp it onto the 0.5 constant
    const int32x4_t v_laneSign = vandq_s32(v_signBit, vreinterpretq_s32_f32(v));
    const float32x4_t v_signedHalf = vreinterpretq_f32_s32(vorrq_s32(v_halfBits, v_laneSign));

    return vaddq_f32(v, v_signedHalf);
}
// 128-bit saturating divide: scale * v1 / v2 per lane.
// Generic case: both halves are widened, divided by the wider overload and
// narrowed back with saturation. The recursion ends at the 32-bit
// specialisations below.
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                      internal::vmovl(internal::vget_low(v2)),
                                      scale)),
        internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                      internal::vmovl(internal::vget_high(v2)),
                                      scale)));
}

// s32 lanes: computed in f32 as (v1 * scale) * recip(v2), pushed through
// vroundq and converted back to integer.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{
    const float32x4_t v_scaled = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
    const float32x4_t v_recip = internal::vrecpq_f32(vcvtq_f32_s32(v2));
    return vcvtq_s32_f32(vroundq(vmulq_f32(v_scaled, v_recip)));
}

// u32 lanes: same computation with unsigned conversions.
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{
    const float32x4_t v_scaled = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
    const float32x4_t v_recip = internal::vrecpq_f32(vcvtq_f32_u32(v2));
    return vcvtq_u32_f32(vroundq(vmulq_f32(v_scaled, v_recip)));
}
// 64-bit counterpart of vroundq: adds 0.5 carrying the sign of each lane, so
// that a following truncating conversion rounds half away from zero.
inline float32x2_t vround(const float32x2_t& v)
{
    const int32x2_t v_signBit = vdup_n_s32(1 << 31);
    const int32x2_t v_halfBits = vreinterpret_s32_f32(vdup_n_f32(0.5f));

    // isolate the sign of every lane and stamp it onto the 0.5 constant
    const int32x2_t v_laneSign = vand_s32(v_signBit, vreinterpret_s32_f32(v));
    const float32x2_t v_signedHalf = vreinterpret_f32_s32(vorr_s32(v_halfBits, v_laneSign));

    return vadd_f32(v, v_signedHalf);
}
// 64-bit saturating divide: scale * v1 / v2 per lane.
// Generic case: the operands are widened to 128 bits, divided by
// divSaturateQ and narrowed back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1),
                                         internal::vmovl(v2),
                                         scale));
}

// s32 lanes: computed in f32 as (v1 * scale) * recip(v2), pushed through
// vround and converted back to integer.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{
    const float32x2_t v_scaled = vmul_n_f32(vcvt_f32_s32(v1), scale);
    const float32x2_t v_recip = internal::vrecp_f32(vcvt_f32_s32(v2));
    return vcvt_s32_f32(vround(vmul_f32(v_scaled, v_recip)));
}

// u32 lanes: same computation with unsigned conversions.
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{
    const float32x2_t v_scaled = vmul_n_f32(vcvt_f32_u32(v1), scale);
    const float32x2_t v_recip = internal::vrecp_f32(vcvt_f32_u32(v2));
    return vcvt_u32_f32(vround(vmul_f32(v_scaled, v_recip)));
}
// 128-bit wrapping divide: scale * v1 / v2 per lane.
// Generic case: both halves are widened, divided by the wider overload and
// narrowed back without saturation (vmovn). The recursion ends at the 32-bit
// specialisations below.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                 internal::vmovl(internal::vget_low(v2)),
                                 scale)),
        internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                 internal::vmovl(internal::vget_high(v2)),
                                 scale)));
}

// s32 lanes: computed in f32 as (v1 * scale) * recip(v2) and converted back
// to integer directly (no rounding helper is applied).
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{
    const float32x4_t v_scaled = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
    const float32x4_t v_recip = internal::vrecpq_f32(vcvtq_f32_s32(v2));
    return vcvtq_s32_f32(vmulq_f32(v_scaled, v_recip));
}

// u32 lanes: same computation with unsigned conversions.
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{
    const float32x4_t v_scaled = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
    const float32x4_t v_recip = internal::vrecpq_f32(vcvtq_f32_u32(v2));
    return vcvtq_u32_f32(vmulq_f32(v_scaled, v_recip));
}
// 64-bit wrapping divide: scale * v1 / v2 per lane.
// Generic case: the operands are widened to 128 bits, divided by divWrapQ
// and narrowed back without saturation.
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1),
                                    internal::vmovl(v2),
                                    scale));
}

// s32 lanes: computed in f32 as (v1 * scale) * recip(v2) and converted back
// to integer directly (no rounding helper is applied).
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{
    const float32x2_t v_scaled = vmul_n_f32(vcvt_f32_s32(v1), scale);
    const float32x2_t v_recip = internal::vrecp_f32(vcvt_f32_s32(v2));
    return vcvt_s32_f32(vmul_f32(v_scaled, v_recip));
}

// u32 lanes: same computation with unsigned conversions.
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{
    const float32x2_t v_scaled = vmul_n_f32(vcvt_f32_u32(v1), scale);
    const float32x2_t v_recip = internal::vrecp_f32(vcvt_f32_u32(v2));
    return vcvt_u32_f32(vmul_f32(v_scaled, v_recip));
}
// Type-overloaded wrappers around the NEON "test bits" intrinsics so that the
// templated div() code can call vtstq()/vtst() for any integer vector type.
// The signed overloads reinterpret the unsigned mask produced by the intrinsic
// back to the signed vector type, so the result type always matches the
// argument type. div() calls these as vtst(v, v) to build a per-lane
// "divisor is non-zero" mask.
// 128-bit vectors:
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
// 64-bit vectors:
inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
/*
 * Element-wise dst = scale * src0 / src1 for integer element types.
 *
 * size                 - image dimensions
 * src0Base/src0Stride  - numerator image and its row stride
 * src1Base/src1Stride  - denominator image and its row stride
 * dstBase/dstStride    - destination image and its row stride
 * scale                - multiplier applied to the quotient
 * cpolicy              - CONVERT_POLICY_SATURATE: result is rounded and
 *                        saturated to T; otherwise (wrap) the result is
 *                        truncated towards zero and narrowed without
 *                        saturation
 *
 * Elements whose denominator is 0 produce 0.
 *
 * When every possible result is known to be 0 the destination is simply
 * cleared. Without CAROTENE_NEON the body only silences unused-parameter
 * warnings.
 */
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Decide whether all results are guaranteed to be zero.
    bool allZero = (scale == 0.0f);
    if (!allZero && std::numeric_limits<T>::is_integer)
    {
        // Largest |src0 / src1| that can occur (|src1| >= 1 whenever the
        // quotient is used). For signed types |min| exceeds max by one.
        const f32 posLimit = (f32)std::numeric_limits<T>::max();
        const f32 negLimit = -(f32)std::numeric_limits<T>::min();
        const f32 maxMagnitude = negLimit > posLimit ? negLimit : posLimit;

        // The saturating path rounds to nearest, so only magnitudes below
        // 0.5 become 0; the wrapping path truncates, so anything below 1
        // becomes 0.
        const f32 zeroBelow = cpolicy == CONVERT_POLICY_SATURATE ? 0.5f : 1.0f;

        const f32 bound = scale * maxMagnitude;
        allZero = bound < zeroBelow && bound > -zeroBelow;
    }

    if (allZero)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Number of elements per 128-bit / 64-bit vector, and the column limits
    // up to which a full vector still fits into the row.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                // all-ones where the divisor is non-zero; zeroes the result
                // of divisions by zero
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // scalar remainder
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // scalar remainder
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
#ifdef CAROTENE_NEON
// Per-lane scale / v2 for a 128-bit integer vector, narrowing with the
// saturating policy.
//
// Primary template (element types narrower than 32 bits): each half of v2 is
// widened with internal::vmovl, handled by the recipSaturateQ overload for the
// wider vector type, narrowed back with internal::vqmovn and the two halves are
// re-joined with internal::vcombine.
// NOTE(review): internal::vqmovn is presumably the saturating narrow, matching
// the NEON vqmovn family -- confirm against the internal:: wrappers.
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale)));
}

// 32-bit signed lanes: convert to float, take the reciprocal via
// internal::vrecpq_f32, multiply by scale and convert back to s32.
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{
    const float32x4_t v_denominator = vcvtq_f32_s32(v2);
    const float32x4_t v_reciprocal = internal::vrecpq_f32(v_denominator);
    const float32x4_t v_scaled = vmulq_n_f32(v_reciprocal, scale);
    return vcvtq_s32_f32(v_scaled);
}

// 32-bit unsigned lanes: same pipeline using the u32 <-> f32 conversions.
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{
    const float32x4_t v_denominator = vcvtq_f32_u32(v2);
    const float32x4_t v_reciprocal = internal::vrecpq_f32(v_denominator);
    const float32x4_t v_scaled = vmulq_n_f32(v_reciprocal, scale);
    return vcvtq_u32_f32(v_scaled);
}
// Per-lane scale / v2 for a 64-bit integer vector, narrowing with the
// saturating policy.
//
// Primary template (element types narrower than 32 bits): widens v2 to a
// 128-bit vector with internal::vmovl, delegates to recipSaturateQ and narrows
// the result back with internal::vqmovn.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(
        recipSaturateQ(internal::vmovl(v2), scale));
}

// Two signed 32-bit lanes: float conversion, internal::vrecp_f32, multiply by
// scale, convert back to s32.
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{
    const float32x2_t v_denominator = vcvt_f32_s32(v2);
    const float32x2_t v_reciprocal = internal::vrecp_f32(v_denominator);
    const float32x2_t v_scaled = vmul_n_f32(v_reciprocal, scale);
    return vcvt_s32_f32(v_scaled);
}

// Two unsigned 32-bit lanes: same pipeline using the u32 <-> f32 conversions.
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{
    const float32x2_t v_denominator = vcvt_f32_u32(v2);
    const float32x2_t v_reciprocal = internal::vrecp_f32(v_denominator);
    const float32x2_t v_scaled = vmul_n_f32(v_reciprocal, scale);
    return vcvt_u32_f32(v_scaled);
}
// Per-lane scale / v2 for a 128-bit integer vector, narrowing with the wrap
// policy.
//
// Primary template (element types narrower than 32 bits): each half of v2 is
// widened with internal::vmovl, handled by the recipWrapQ overload for the wider
// vector type, narrowed back with internal::vmovn and the two halves are
// re-joined with internal::vcombine.
// NOTE(review): internal::vmovn is presumably the truncating (non-saturating)
// narrow, matching the NEON vmovn family -- confirm against the internal:: wrappers.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
        internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)));
}

// 32-bit signed lanes: convert to float, take the reciprocal via
// internal::vrecpq_f32, multiply by scale and convert back to s32.
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{
    const float32x4_t v_denominator = vcvtq_f32_s32(v2);
    const float32x4_t v_reciprocal = internal::vrecpq_f32(v_denominator);
    const float32x4_t v_scaled = vmulq_n_f32(v_reciprocal, scale);
    return vcvtq_s32_f32(v_scaled);
}

// 32-bit unsigned lanes: same pipeline using the u32 <-> f32 conversions.
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{
    const float32x4_t v_denominator = vcvtq_f32_u32(v2);
    const float32x4_t v_reciprocal = internal::vrecpq_f32(v_denominator);
    const float32x4_t v_scaled = vmulq_n_f32(v_reciprocal, scale);
    return vcvtq_u32_f32(v_scaled);
}
// Per-lane scale / v2 for a 64-bit integer vector, narrowing with the wrap
// policy.
//
// Primary template (element types narrower than 32 bits): widens v2 to a
// 128-bit vector with internal::vmovl, delegates to recipWrapQ and narrows the
// result back with internal::vmovn.
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(
        recipWrapQ(internal::vmovl(v2), scale));
}

// Two signed 32-bit lanes: float conversion, internal::vrecp_f32, multiply by
// scale, convert back to s32.
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{
    const float32x2_t v_denominator = vcvt_f32_s32(v2);
    const float32x2_t v_reciprocal = internal::vrecp_f32(v_denominator);
    const float32x2_t v_scaled = vmul_n_f32(v_reciprocal, scale);
    return vcvt_s32_f32(v_scaled);
}

// Two unsigned 32-bit lanes: same pipeline using the u32 <-> f32 conversions.
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{
    const float32x2_t v_denominator = vcvt_f32_u32(v2);
    const float32x2_t v_reciprocal = internal::vrecp_f32(v_denominator);
    const float32x2_t v_scaled = vmul_n_f32(v_reciprocal, scale);
    return vcvt_u32_f32(v_scaled);
}
#endif
// Scaled reciprocal of an image: dst = scale / src1 per element, with 0 written
// wherever src1 is 0.
//
// size                  - image dimensions; size.width is the number of elements per row.
// src1Base / src1Stride - source image and its row stride (as consumed by internal::getRowPtr).
// dstBase / dstStride   - destination image and its row stride.
// scale                 - numerator of the reciprocal.
// cpolicy               - CONVERT_POLICY_SATURATE selects the recipSaturate* helpers and
//                         internal::saturate_cast for the scalar tail; any other value
//                         takes the wrap path (recipWrap* helpers, truncating cast).
//
// When CAROTENE_NEON is not defined, nothing is written to dst: after
// internal::assertSupportedConfiguration() the arguments are only voided.
template <typename T>
void recip(const Size2D &size,
const T * src1Base, ptrdiff_t src1Stride,
T * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
// Shortcut: a zero scale, or (for integer T) a scale strictly between -1 and 1,
// is treated as producing an all-zero result, so every row is simply cleared.
if (scale == 0.0f ||
(std::numeric_limits<T>::is_integer &&
scale < 1.0f &&
scale > -1.0f))
{
for (size_t y = 0; y < size.height; ++y)
{
T * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(T) * size.width);
}
return;
}
// step128 / step64: number of T elements in a 16-byte / 8-byte vector.
// roiw128 / roiw64: exclusive upper bounds on the start index j such that a full
// vector starting at j still ends within the row (0 when the row is too short).
const size_t step128 = 16 / sizeof(T);
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
const size_t step64 = 8 / sizeof(T);
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
T * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
// 128-bit vectors first.
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
// NOTE(review): vtstq(x, x) presumably yields an all-ones lane where x != 0 and
// zero otherwise (NEON "test bits" semantics); AND-ing with it forces the
// result to 0 for zero divisors.
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
}
// Then 64-bit vectors for what is left.
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
}
// Scalar tail for the remaining elements.
for (; j < size.width; j++)
{
dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
}
}
else // CONVERT_POLICY_WRAP
{
// Same three-stage structure as above, using the wrap helpers.
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
}
for (; j < size.width; j++)
{
// Truncate toward zero, go through s32, then cast to T.
dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
}
}
}
#else
(void)size;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)cpolicy;
(void)scale;
#endif
}
}
// Element-wise scaled division for u8 images.
// Forwards every argument unchanged to the div<u8> template, which computes
// scale * src0 / src1 per element (0 where src1 is 0) and converts the result
// according to cpolicy.
void div(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled division for s8 images.
// Forwards every argument unchanged to the div<s8> template, which computes
// scale * src0 / src1 per element (0 where src1 is 0) and converts the result
// according to cpolicy.
void div(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled division for u16 images.
// Forwards every argument unchanged to the div<u16> template, which computes
// scale * src0 / src1 per element (0 where src1 is 0) and converts the result
// according to cpolicy.
void div(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled division for s16 images.
// Forwards every argument unchanged to the div<s16> template, which computes
// scale * src0 / src1 per element (0 where src1 is 0) and converts the result
// according to cpolicy.
void div(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled division for s32 images.
// Forwards every argument unchanged to the div<s32> template, which computes
// scale * src0 / src1 per element (0 where src1 is 0) and converts the result
// according to cpolicy.
void div(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled division for f32 images: dst = src0 * scale / src1, with
// 0 written wherever src1 compares equal to 0.
//
// size                  - image dimensions; size.width is the number of floats per row.
// src0Base / src0Stride - numerator image and its row stride.
// src1Base / src1Stride - denominator image and its row stride.
// dstBase / dstStride   - destination image and its row stride.
// scale                 - multiplier applied to the numerator.
//
// When CAROTENE_NEON is not defined, nothing is written to dst.
// NOTE(review): the vector paths multiply by internal::vrecpq_f32 / vrecp_f32
// while the scalar tail uses a true division; the two may differ in the last
// bits depending on how those helpers refine the reciprocal -- confirm.
void div(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// A zero scale makes every quotient zero: just clear the destination rows.
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
float32x4_t v_zero = vdupq_n_f32(0.0f);
// Exclusive bounds on j so that a 4-float (roiw128) or 2-float (roiw64) load
// starting at j stays inside the row.
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
// Specialised path for scale ~= 1: the multiplication by scale is skipped.
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
// v_mask is all-ones in lanes where the divisor equals 0; vbicq_u32 (a & ~mask)
// then clears those lanes of the quotient, i.e. writes +0.0f there.
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
}
// Scalar tail.
for (; j < size.width; j++)
{
dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
}
}
}
else
{
// General path: numerator is multiplied by scale before the reciprocal multiply.
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
internal::vrecpq_f32(v_src1))), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
internal::vrecp_f32(v_src1))), v_mask)));
}
// Scalar tail.
for (; j < size.width; j++)
{
dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
}
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
// Element-wise scaled reciprocal for u8 images.
// Forwards every argument unchanged to the recip<u8> template, which computes
// scale / src per element (0 where src is 0) and converts the result according
// to cpolicy.
void reciprocal(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled reciprocal for s8 images.
// Forwards every argument unchanged to the recip<s8> template, which computes
// scale / src per element (0 where src is 0) and converts the result according
// to cpolicy.
void reciprocal(const Size2D &size,
const s8 * srcBase, ptrdiff_t srcStride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled reciprocal for u16 images.
// Forwards every argument unchanged to the recip<u16> template, which computes
// scale / src per element (0 where src is 0) and converts the result according
// to cpolicy.
void reciprocal(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled reciprocal for s16 images.
// Forwards every argument unchanged to the recip<s16> template, which computes
// scale / src per element (0 where src is 0) and converts the result according
// to cpolicy.
void reciprocal(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled reciprocal for s32 images.
// Forwards every argument unchanged to the recip<s32> template, which computes
// scale / src per element (0 where src is 0) and converts the result according
// to cpolicy.
void reciprocal(const Size2D &size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
// Element-wise scaled reciprocal for f32 images: dst = scale / src, with 0
// written wherever src compares equal to 0.
//
// size                - image dimensions; size.width is the number of floats per row.
// srcBase / srcStride - source image and its row stride.
// dstBase / dstStride - destination image and its row stride.
// scale               - numerator of the reciprocal.
//
// When CAROTENE_NEON is not defined, nothing is written to dst.
// NOTE(review): the vector paths rely on internal::vrecpq_f32 / vrecp_f32 while
// the scalar tail uses a true division; results may differ in the last bits
// depending on how those helpers refine the reciprocal -- confirm.
void reciprocal(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// A zero scale makes every result zero: just clear the destination rows.
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
float32x4_t v_zero = vdupq_n_f32(0.0f);
// Exclusive bounds on j so that a 4-float (roiw128) or 2-float (roiw64) load
// starting at j stays inside the row.
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
// Specialised path for scale ~= 1: the multiplication by scale is skipped.
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
// v_mask is all-ones in lanes where the input equals 0; vbicq_u32 (a & ~mask)
// then clears those lanes of the reciprocal, i.e. writes +0.0f there.
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
}
// Scalar tail.
for (; j < size.width; j++)
{
dst[j] = src1[j] ? 1.0f / src1[j] : 0;
}
}
}
else
{
// General path: the reciprocal is multiplied by scale.
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
scale)),v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
scale)), v_mask)));
}
// Scalar tail.
for (; j < size.width; j++)
{
dst[j] = src1[j] ? scale / src1[j] : 0;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

260
3rdparty/carotene/src/dot_product.cpp vendored Normal file
View File

@ -0,0 +1,260 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
// Dot product of two u8 images: the sum over all pixels of src0 * src1,
// accumulated in integer vectors and returned as f64.
//
// _size                 - image dimensions (elements per row, number of rows).
// src0Base / src0Stride - first image and its row stride.
// src1Base / src1Stride - second image and its row stride.
//
// Returns 0 when CAROTENE_NEON is not defined.
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);

    // Two densely packed images with identical strides can be walked as a
    // single long row.
    const bool packedRows = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width);
    if (packedRows)
    {
        size.width *= size.height;
        size.height = 1;
    }

// A u32 lane can absorb up to 66051 u8*u8 products without overflowing.
// Every 16-element step adds two products to each lane, which bounds one
// block to 66051/2*16 elements.
#define DOT_UINT_BLOCKSIZE 66050*8

    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);
        size_t x = 0;
        uint64x2_t v_rowSum = vmovq_n_u64(0);

        // Full 16-element steps, grouped into blocks whose u32 partial sums
        // are flushed into the u64 row accumulator.
        for (; x + 16 <= size.width; )
        {
            const size_t lastStart = std::min(x + DOT_UINT_BLOCKSIZE, size.width) - 16;
            uint32x4_t v_accLow = vmovq_n_u32(0);
            uint32x4_t v_accHigh = vmovq_n_u32(0);
            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                const uint8x16_t v_a = vld1q_u8(rowA + x);
                const uint8x16_t v_b = vld1q_u8(rowB + x);
                const uint16x8_t v_prodLow = vmull_u8(vget_low_u8(v_a), vget_low_u8(v_b));
                const uint16x8_t v_prodHigh = vmull_u8(vget_high_u8(v_a), vget_high_u8(v_b));
                v_accLow = vpadalq_u16(v_accLow, v_prodLow);
                v_accHigh = vpadalq_u16(v_accHigh, v_prodHigh);
                x += 16;
            }
            v_rowSum = vpadalq_u32(v_rowSum, v_accLow);
            v_rowSum = vpadalq_u32(v_rowSum, v_accHigh);
        }

        // At most one 8-element step.
        if (x + 8 <= size.width)
        {
            const uint8x8_t v_a = vld1_u8(rowA + x);
            const uint8x8_t v_b = vld1_u8(rowB + x);
            const uint16x8_t v_prod = vmull_u8(v_a, v_b);
            v_rowSum = vpadalq_u32(v_rowSum, vpaddlq_u16(v_prod));
            x += 8;
        }

        // Fold the two u64 lanes and add the row's vector sum to the total.
        const uint64x1_t v_folded = vadd_u64(vget_low_u64(v_rowSum), vget_high_u64(v_rowSum));
        total += (double)vget_lane_u64(v_folded, 0);

        // Scalar tail.
        while (x < size.width)
        {
            total += s32(rowA[x]) * s32(rowB[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
// Dot product of two s8 images: the sum over all pixels of src0 * src1,
// accumulated in integer vectors and returned as f64.
//
// _size                 - image dimensions (elements per row, number of rows).
// src0Base / src0Stride - first image and its row stride.
// src1Base / src1Stride - second image and its row stride.
//
// Returns 0 when CAROTENE_NEON is not defined.
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Two densely packed images with identical strides can be walked as a
    // single long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
// (65535 steps * 2 products * 16384 max product = 2147450880 < 2^31 - 1).
#define DOT_INT_BLOCKSIZE (131070*8)

    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        // Full 16-element steps, grouped into blocks whose s32 partial sums
        // are flushed into the s64 row accumulator before they can overflow.
        while(i + 16 <= size.width)
        {
            // Block length uses the signed-input bound defined above for this
            // overload, not the (smaller) unsigned one of the u8 overload.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // At most one 8-element step.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        // Fold the two s64 lanes and add the row's vector sum to the total.
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
// Dot product of two f32 images: the sum over all pixels of src0 * src1.
// Partial sums are formed in single precision per block and added to an f64
// total.
//
// _size                 - image dimensions (elements per row, number of rows).
// src0Base / src0Stride - first image and its row stride in bytes.
// src1Base / src1Stride - second image and its row stride in bytes.
//
// Returns 0 when CAROTENE_NEON is not defined.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);

    // Two densely packed images with identical strides can be walked as a
    // single long row.
    const bool packedRows = src0Stride == src1Stride &&
                            src0Stride == (ptrdiff_t)(size.width * sizeof(f32));
    if (packedRows)
    {
        size.width *= size.height;
        size.height = 1;
    }

// Number of elements accumulated in single precision before the partial sum
// is flushed into the f64 total.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)

    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);
        size_t x = 0;

        // Full 4-element steps, grouped into blocks.
        for (; x + 4 <= size.width; )
        {
            const size_t lastStart = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_acc = vdupq_n_f32(0.0f);
            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                const float32x4_t v_a = vld1q_f32(rowA + x);
                const float32x4_t v_b = vld1q_f32(rowB + x);
                v_acc = vmlaq_f32(v_acc, v_a, v_b);
                x += 4;
            }
            // Horizontal add of the four lanes (in f32), then widen.
            const float32x2_t v_pair = vpadd_f32(vget_low_f32(v_acc), vget_high_f32(v_acc));
            total += vget_lane_f32(v_pair, 0) + vget_lane_f32(v_pair, 1);
        }

        // At most one 2-element step.
        if (x + 2 <= size.width)
        {
            const float32x2_t v_a = vld1_f32(rowA + x);
            const float32x2_t v_b = vld1_f32(rowB + x);
            const float32x2_t v_prod = vmul_f32(v_a, v_b);
            total += vget_lane_f32(v_prod, 0) + vget_lane_f32(v_prod, 1);
            x += 2;
        }

        // Scalar tail.
        while (x < size.width)
        {
            total += rowA[x] * rowB[x];
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
} // namespace CAROTENE_NS

428
3rdparty/carotene/src/fast.cpp vendored Normal file
View File

@ -0,0 +1,428 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
// Fills pixel[0..15] with the offsets, relative to a centre pixel, of the 16
// sample points used by the FAST detector.
//
// pixel      - output array; must have room for at least 16 entries.
// row_stride - distance between vertically adjacent pixels, in the same unit
//              as the offsets written to pixel[].
//
// Entry k is dx + row_stride * dy for the k-th (dx, dy) pair in the table.
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
    // (dx, dy) of every sample point, in output order.
    static const int samplePoints[16][2] =
    {
        { 0,  3}, { 1,  3}, { 2,  2}, { 3,  1},
        { 3,  0}, { 3, -1}, { 2, -2}, { 1, -3},
        { 0, -3}, {-1, -3}, {-2, -2}, {-3, -1},
        {-3,  0}, {-3,  1}, {-2,  2}, {-1,  3}
    };

    for (int idx = 0; idx < 16; ++idx)
    {
        const int dx = samplePoints[idx][0];
        const int dy = samplePoints[idx][1];
        pixel[idx] = dx + row_stride * dy;
    }
}
// Computes the corner score of the pixel at ptr.
//
// ptr   - pointer to the centre pixel.
// pixel - offsets of the sample points relative to ptr; entries 0..N-1 (N = 25)
//         are read, so the caller must supply the 16 circle offsets followed by
//         a repeat of the first 9.
//
// The score is derived from windows of 9 consecutive differences
// d[k] = centre - sample[k]: with A = max over windows of the window minimum and
// B = min over windows of the window maximum, the function returns
// (u8)(max(A, -B) - 1), where A is floored at -1000 and B capped at 1000 by the
// initial values of q0 / q1.
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
const s32 K = 8, N = 16 + K + 1;
s32 k, v = ptr[0];
// Size is N rounded up to a multiple of 8 (32 entries) so the four 8-lane loads
// below stay inside the array. Only d[0..N-1] are initialised; of the last load
// (d24) only lane 0, i.e. d[24], is consumed.
s16 d[(N + 7) & ~7];
for( k = 0; k < N; k++ )
d[k] = (s16)(v - ptr[pixel[k]]);
// q0: running maximum of window minima; q1: running minimum of window maxima.
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
int16x8_t q1 = vdupq_n_s16((s16)(1000));
int16x8_t d0_7 = vld1q_s16(d + 0);
int16x8_t d8_15 = vld1q_s16(d + 8);
int16x8_t d16_23 = vld1q_s16(d + 16);
int16x8_t d24 = vld1q_s16(d + 24);
//k == 0
// Lane L of ak0 / bk0 ends up holding the min / max of d[L+1 .. L+8].
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 3);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 4);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 5);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 6);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 7);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
ak0 = vminq_s16(ak0, d8_15);
bk0 = vmaxq_s16(bk0, d8_15);
// Extend the 8-wide window by one element on the left (d[L]) ...
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
// ... and by one element on the right (d[L+9]), giving the 9-element windows
// starting at L and L+1.
v1k0 = vextq_s16(d8_15, d16_23, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
//k == 8
// Same scheme shifted by 8: lane L of ak8 / bk8 covers d[L+9 .. L+16].
int16x8_t v0k8 = v1k0;
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 3);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 4);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 5);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 6);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 7);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
ak8 = vminq_s16(ak8, d16_23);
bk8 = vmaxq_s16(bk8, d16_23);
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
v1k8 = vextq_s16(d16_23, d24, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
//fin
// Per lane max(q0, -q1), followed by a horizontal maximum over the 8 lanes:
// 8 -> 4 lanes, widen to s32, 4 -> 2 lanes, then the 64-bit shift folds lane 1
// onto lane 0.
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
int32x4_t q2w = vmovl_s16(q2);
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
// FAST corner detector over a u8 image.
//
// size                - image dimensions.
// srcBase / srcStride - image data and its row stride.
// keypoints           - receives one push() per accepted corner; the store is
//                       not cleared here.
// threshold           - intensity difference a circle sample must exceed to be
//                       classified as brighter / darker than the centre.
// nonmax_suppression  - when true, a corner is kept only if its cornerScore is
//                       strictly greater than that of its 8 neighbours; when
//                       false, every detected corner is pushed and its reported
//                       score is 0 because the score rows are never filled.
//
// A pixel is a corner when more than K (= 8) consecutive samples on the
// 16-point circle are all brighter than centre + threshold or all darker than
// centre - threshold. Rows 3 .. height-4 and columns 3 .. width-4 are examined.
//
// When CAROTENE_NEON is not defined, nothing is detected.
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
//keypoints.clear();
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
// Repeat the first N-16 offsets after the 16 circle offsets so that runs of
// consecutive samples can be scanned across the wrap-around point.
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
// delta flips the top bit of a byte so unsigned values can be compared with the
// signed vcgtq_s8 comparison.
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
// threshold_tab[diff + 255] classifies diff = sample - centre:
// 1 = darker (diff < -threshold), 2 = brighter (diff > threshold), 0 = similar.
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
// One allocation holding three score rows (buf) followed by three corner
// position rows (cpbuf); both are used as 3-entry ring buffers indexed by row.
// The element just before each cpbuf row (index -1) stores that row's corner count.
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
// The loop runs one row past the last examined row (i == height-3) so that the
// results of the final examined row are still emitted below.
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
// Vector path: 16 centre pixels per iteration.
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
// v1 = centre - threshold, v2 = centre + threshold (saturating), both moved
// into the sign-flipped domain.
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
// Quick rejection using the four samples 0, 4, 8 and 12 of the circle.
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
// m0: two adjacent samples of those four are both brighter;
// m1: two adjacent samples of those four are both darker.
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
// No candidate among the first 8 pixels: skip the block. If the last 8 pixels do
// contain candidates, step back by 8 so that the loop increment of 16 restarts
// the next iteration at those pixels.
if( mask[0] == 0 )
{
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
// Full test: c0 / c1 count the current run of consecutive brighter / darker
// samples per lane. A true comparison lane is 0xFF, so (c - m) & m increments
// the counter when the lane matches and resets it to 0 otherwise.
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
// Lanes whose longest run exceeds K are corners.
u8 m[16];
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
// Scalar path for the remaining columns of the row.
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
// tab[x] classifies sample value x against this centre value v.
const u8* tab = &threshold_tab[0] - v + 255;
// Progressive rejection on opposite sample pairs: bit 1 (darker) / bit 2
// (brighter) of d survives only while every tested pair contains at least one
// sample of that class.
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
// Darker candidate: look for more than K consecutive samples below v - threshold.
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
// Brighter candidate: look for more than K consecutive samples above v + threshold.
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
// Remember how many corners this row produced.
cornerpos[-1] = ncorners;
// The first processed row has no previous row to emit yet.
if( i == 3 )
continue;
// Emit the corners of row i-1 now that rows i-2 (pprev), i-1 (prev) and i (curr)
// are all available.
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
// NOTE(review): arguments are presumably (x, y, size, angle, response) --
// confirm against KeypointStore::push.
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS

442
3rdparty/carotene/src/fill_minmaxloc.cpp vendored Normal file
View File

@ -0,0 +1,442 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
/*
 * Scalar scan of src[j0, j1) on image row i.
 *
 * Every element equal to maxVal (checked first) or minVal is reported by
 * writing its column index followed by the row index into the matching
 * location buffer. The counters and capacities are expressed in size_t
 * slots, i.e. two slots per location. A hit that no longer fits is not
 * stored but still advances the counter by two.
 */
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    size_t col = j0;
    while (col < j1)
    {
        const T pixel = src[col];
        const bool hitsMax = (pixel == maxVal);
        const bool hitsMin = (pixel == minVal);

        if (hitsMax)
        {
            if (maxLocCount < maxLocCapacity)
            {
                size_t * slot = maxLocPtr + maxLocCount;
                slot[0] = col;   // x
                slot[1] = i;     // y
            }
            maxLocCount += 2;
        }

        if (hitsMin)
        {
            if (minLocCount < minLocCapacity)
            {
                size_t * slot = minLocPtr + minLocCount;
                slot[0] = col;   // x
                slot[1] = i;     // y
            }
            minLocCount += 2;
        }

        ++col;
    }
}
} // namespace
#endif
/*
 * Records the (column, row) coordinates of every u8 pixel equal to minVal
 * or maxVal.
 *
 * Locations are appended to minLocPtr / maxLocPtr as consecutive pairs
 * (column first, then row). The capacities are given in locations; hits
 * beyond a capacity are counted but not stored, so the resulting counts may
 * exceed the capacities.
 *
 * NOTE(review): the incoming minLocCount / maxLocCount are used as the
 * starting write position as-is - presumably callers pass 0; confirm.
 *
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u8 * srcBase, ptrdiff_t srcStride,
                    u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const uint8x16_t vMax16 = vdupq_n_u8(maxVal);
    const uint8x16_t vMin16 = vdupq_n_u8(minVal);
    const uint8x8_t vMax8 = vdup_n_u8(maxVal);
    const uint8x8_t vMin8 = vdup_n_u8(minVal);

    // Exclusive bounds on the start column of a full 16- / 8-pixel vector.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    // Scratch area the comparison mask is spilled to; one u64 per 8 pixels.
    u64 hits[2] = { 0ul };

    // process() works in size_t slots: two slots per stored location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // 16 pixels at a time: the scalar scan runs only on halves with a hit.
        while (col < end16)
        {
            internal::prefetch(line + col);
            const uint8x16_t vPix = vld1q_u8(line + col);
            const uint8x16_t vHit = vorrq_u8(vceqq_u8(vPix, vMax16),
                                             vceqq_u8(vPix, vMin16));
            vst1q_u8((u8 *)&hits[0], vHit);

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (hits[1])
                process(line, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 16;
        }

        // 8 pixels at a time.
        while (col < end8)
        {
            const uint8x8_t vPix = vld1_u8(line + col);
            const uint8x8_t vHit = vorr_u8(vceq_u8(vPix, vMax8),
                                           vceq_u8(vPix, vMin8));
            vst1_u8((u8 *)&hits[0], vHit);

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 8;
        }

        // Remaining pixels of the row.
        process(line, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slots to locations.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) coordinates of every u16 pixel equal to minVal
 * or maxVal.
 *
 * Locations are appended to minLocPtr / maxLocPtr as consecutive pairs
 * (column first, then row). The capacities are given in locations; hits
 * beyond a capacity are counted but not stored, so the resulting counts may
 * exceed the capacities.
 *
 * NOTE(review): the incoming minLocCount / maxLocCount are used as the
 * starting write position as-is - presumably callers pass 0; confirm.
 *
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u16 * srcBase, ptrdiff_t srcStride,
                    u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const uint16x8_t vMax = vdupq_n_u16(maxVal);
    const uint16x8_t vMin = vdupq_n_u16(minVal);

    // Exclusive bounds on the start column of a full 16- / 8-pixel step.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    // Scratch area for the narrowed (one byte per pixel) comparison mask.
    u64 hits[2] = { 0ul };

    // process() works in size_t slots: two slots per stored location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // Two 8-lane vectors per step; each half is rescanned only on a hit.
        while (col < end16)
        {
            internal::prefetch(line + col);
            const uint16x8_t vFirst = vld1q_u16(line + col);
            const uint16x8_t vSecond = vld1q_u16(line + col + 8);
            const uint16x8_t hitFirst = vorrq_u16(vceqq_u16(vFirst, vMax),
                                                  vceqq_u16(vFirst, vMin));
            const uint16x8_t hitSecond = vorrq_u16(vceqq_u16(vSecond, vMax),
                                                   vceqq_u16(vSecond, vMin));
            vst1q_u8((u8 *)&hits[0],
                     vcombine_u8(vmovn_u16(hitFirst), vmovn_u16(hitSecond)));

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (hits[1])
                process(line, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 16;
        }

        // One 8-lane vector per step.
        while (col < end8)
        {
            internal::prefetch(line + col);
            const uint16x8_t vPix = vld1q_u16(line + col);
            const uint16x8_t vHit = vorrq_u16(vceqq_u16(vPix, vMax),
                                              vceqq_u16(vPix, vMin));
            vst1_u8((u8 *)&hits[0], vmovn_u16(vHit));

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 8;
        }

        // Remaining pixels of the row.
        process(line, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slots to locations.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) coordinates of every s16 pixel equal to minVal
 * or maxVal.
 *
 * Locations are appended to minLocPtr / maxLocPtr as consecutive pairs
 * (column first, then row). The capacities are given in locations; hits
 * beyond a capacity are counted but not stored, so the resulting counts may
 * exceed the capacities.
 *
 * NOTE(review): the incoming minLocCount / maxLocCount are used as the
 * starting write position as-is - presumably callers pass 0; confirm.
 *
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void fillMinMaxLocs(const Size2D & size,
                    const s16 * srcBase, ptrdiff_t srcStride,
                    s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const int16x8_t vMax = vdupq_n_s16(maxVal);
    const int16x8_t vMin = vdupq_n_s16(minVal);

    // Exclusive bounds on the start column of a full 16- / 8-pixel step.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    // Scratch area for the narrowed (one byte per pixel) comparison mask.
    u64 hits[2] = { 0ul };

    // process() works in size_t slots: two slots per stored location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // Two 8-lane vectors per step; each half is rescanned only on a hit.
        while (col < end16)
        {
            internal::prefetch(line + col);
            const int16x8_t vFirst = vld1q_s16(line + col);
            const int16x8_t vSecond = vld1q_s16(line + col + 8);
            const uint16x8_t hitFirst = vorrq_u16(vceqq_s16(vFirst, vMax),
                                                  vceqq_s16(vFirst, vMin));
            const uint16x8_t hitSecond = vorrq_u16(vceqq_s16(vSecond, vMax),
                                                   vceqq_s16(vSecond, vMin));
            vst1q_u8((u8 *)&hits[0],
                     vcombine_u8(vmovn_u16(hitFirst), vmovn_u16(hitSecond)));

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            if (hits[1])
                process(line, col + 8, col + 16, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 16;
        }

        // One 8-lane vector per step.
        while (col < end8)
        {
            internal::prefetch(line + col);
            const int16x8_t vPix = vld1q_s16(line + col);
            const uint16x8_t vHit = vorrq_u16(vceqq_s16(vPix, vMax),
                                              vceqq_s16(vPix, vMin));
            vst1_u8((u8 *)&hits[0], vmovn_u16(vHit));

            if (hits[0])
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 8;
        }

        // Remaining pixels of the row.
        process(line, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slots to locations.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) coordinates of every s32 pixel equal to minVal
 * or maxVal.
 *
 * Locations are appended to minLocPtr / maxLocPtr as consecutive pairs
 * (column first, then row). The capacities are given in locations; hits
 * beyond a capacity are counted but not stored, so the resulting counts may
 * exceed the capacities.
 *
 * NOTE(review): the incoming minLocCount / maxLocCount are used as the
 * starting write position as-is - presumably callers pass 0; confirm.
 *
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void fillMinMaxLocs(const Size2D & size,
                    const s32 * srcBase, ptrdiff_t srcStride,
                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const int32x4_t vMax = vdupq_n_s32(maxVal);
    const int32x4_t vMin = vdupq_n_s32(minVal);

    // Exclusive bound on the start column of a full 8-pixel step.
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    // Scratch word for the narrowed (one byte per pixel) comparison mask.
    u64 hits = 0ul;

    // process() works in size_t slots: two slots per stored location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // Two 4-lane vectors per step; rescanned by process() only on a hit.
        while (col < end8)
        {
            internal::prefetch(line + col);
            const int32x4_t vFirst = vld1q_s32(line + col);
            const int32x4_t vSecond = vld1q_s32(line + col + 4);
            const uint32x4_t hitFirst = vorrq_u32(vceqq_s32(vFirst, vMax),
                                                  vceqq_s32(vFirst, vMin));
            const uint32x4_t hitSecond = vorrq_u32(vceqq_s32(vSecond, vMax),
                                                   vceqq_s32(vSecond, vMin));
            const uint16x8_t hit16 = vcombine_u16(vmovn_u32(hitFirst), vmovn_u32(hitSecond));
            vst1_u8((u8 *)&hits, vmovn_u16(hit16));

            if (hits)
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 8;
        }

        // Remaining pixels of the row.
        process(line, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slots to locations.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
/*
 * Records the (column, row) coordinates of every u32 pixel equal to minVal
 * or maxVal.
 *
 * Locations are appended to minLocPtr / maxLocPtr as consecutive pairs
 * (column first, then row). The capacities are given in locations; hits
 * beyond a capacity are counted but not stored, so the resulting counts may
 * exceed the capacities.
 *
 * NOTE(review): the incoming minLocCount / maxLocCount are used as the
 * starting write position as-is - presumably callers pass 0; confirm.
 *
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void fillMinMaxLocs(const Size2D & size,
                    const u32 * srcBase, ptrdiff_t srcStride,
                    u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
                    u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const uint32x4_t vMax = vdupq_n_u32(maxVal);
    const uint32x4_t vMin = vdupq_n_u32(minVal);

    // Exclusive bound on the start column of a full 8-pixel step.
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    // Scratch word for the narrowed (one byte per pixel) comparison mask.
    u64 hits = 0ul;

    // process() works in size_t slots: two slots per stored location.
    minLocCapacity <<= 1;
    maxLocCapacity <<= 1;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t col = 0;

        // Two 4-lane vectors per step; rescanned by process() only on a hit.
        while (col < end8)
        {
            internal::prefetch(line + col);
            const uint32x4_t vFirst = vld1q_u32(line + col);
            const uint32x4_t vSecond = vld1q_u32(line + col + 4);
            const uint32x4_t hitFirst = vorrq_u32(vceqq_u32(vFirst, vMax),
                                                  vceqq_u32(vFirst, vMin));
            const uint32x4_t hitSecond = vorrq_u32(vceqq_u32(vSecond, vMax),
                                                   vceqq_u32(vSecond, vMin));
            const uint16x8_t hit16 = vcombine_u16(vmovn_u32(hitFirst), vmovn_u32(hitSecond));
            vst1_u8((u8 *)&hits, vmovn_u16(hit16));

            if (hits)
                process(line, col, col + 8, row,
                        minVal, minLocPtr, minLocCount, minLocCapacity,
                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
            col += 8;
        }

        // Remaining pixels of the row.
        process(line, col, size.width, row,
                minVal, minLocPtr, minLocCount, minLocCapacity,
                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
    }

    // Back from slots to locations.
    minLocCount >>= 1;
    maxLocCount >>= 1;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minLocPtr;
    (void)minLocCount;
    (void)minLocCapacity;
    (void)maxVal;
    (void)maxLocPtr;
    (void)maxLocCount;
    (void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS

222
3rdparty/carotene/src/flip.cpp vendored Normal file
View File

@ -0,0 +1,222 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
/*
 * Tells whether flip() can handle the given mode / element size.
 *
 * A vertical flip is accepted for any element size; horizontal and
 * both-axes flips require an element size of 1, 2, 3 or 4 bytes.
 * Nothing is supported when isSupportedConfiguration() reports false.
 */
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
    if (!isSupportedConfiguration())
        return false;

    if (flipMode == FLIP_VERTICAL_MODE)
        return true;

    const bool mirrorsColumns = (flipMode == FLIP_BOTH_MODE) ||
                                (flipMode == FLIP_HORIZONTAL_MODE);
    const bool elemSizeOk = (elemSize >= 1) && (elemSize <= 4);
    return mirrorsColumns && elemSizeOk;
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
// Mirrors each row left-to-right for pixels made of three interleaved
// elements of type T; the order of the three elements inside a pixel is kept.
// When flipMode has the FLIP_VERTICAL_MODE bit set, source row i is also
// written to destination row (height - 1 - i).
//
// j counts pixels, js / jd are element offsets (3 per pixel) into the source
// and destination rows; the destination is filled from the row end backwards.
// The 128-bit path is compiled out when ANDROID is defined (the reason is not
// visible in this file).
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef ANDROID
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef ANDROID
// Pixels per 128-bit step and the matching number of elements (3 per pixel).
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
// Pixels per 64-bit step and the matching number of elements (3 per pixel).
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef ANDROID
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
// Load three vectors (val[0..2]) and reverse each one independently:
// reverse inside each 64-bit half, then swap the halves.
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // ANDROID
// 64-bit steps: a single vrev64 per vector reverses it completely.
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
// Scalar remainder: jd is moved onto the first element of the last
// unwritten destination pixel and each pixel is copied as a triple.
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
// Common signature of the flip<T> / flip3<T> kernels, used by the public
// flip() entry point to select an implementation from the element size.
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
/*
 * Flips an image whose pixels are elemSize bytes wide.
 *
 * FLIP_VERTICAL_MODE is done by copying whole rows in reversed row order.
 * Any other mode is dispatched to a typed kernel chosen by elemSize
 * (u8, u16, u32, or three u8 elements); if no kernel matches, nothing is
 * written. The mode / element size combination is asserted via
 * isFlipSupported() first.
 *
 * Without CAROTENE_NEON only that assertion is performed.
 */
void flip(const Size2D &size,
          const u8 * srcBase, ptrdiff_t srcStride,
          u8 * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode, u32 elemSize)
{
    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
    if (flipMode != FLIP_VERTICAL_MODE)
    {
        // Checked from the most specific size downwards so that the selection
        // matches a sequence of independent tests where the last match wins.
        flipFunc kernel = NULL;
        if (elemSize == (u32)sizeof(u8) * 3)
            kernel = &flip3<u8>;
        else if (elemSize == (u32)sizeof(u32))
            kernel = &flip<u32>;
        else if (elemSize == (u32)sizeof(u16))
            kernel = &flip<u16>;
        else if (elemSize == (u32)sizeof(u8))
            kernel = &flip<u8>;

        if (kernel != NULL)
            kernel(size,
                   srcBase, srcStride,
                   dstBase, dstStride,
                   flipMode);
        return;
    }

    // Pure vertical flip: source row y becomes destination row (height - 1 - y).
    for (size_t y = 0; y < size.height; ++y)
    {
        const u8 * from = internal::getRowPtr(srcBase, srcStride, y);
        u8 * to = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
        std::memcpy(to, from, elemSize * size.width);
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)flipMode;
    (void)elemSize;
#endif
}
} // namespace CAROTENE_NS

1059
3rdparty/carotene/src/gaussian_blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

195
3rdparty/carotene/src/in_range.cpp vendored Normal file
View File

@ -0,0 +1,195 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// vnst: store two comparison-mask vectors to dst as one byte per lane.

// 8-bit lanes: both vectors are written unchanged (32 bytes).
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
{
    vst1q_u8(dst, v1);
    vst1q_u8(dst + 16, v2);
}

// 16-bit lanes: each lane is narrowed to its low byte (16 bytes written).
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
{
    const uint8x8_t first = vmovn_u16(v1);
    const uint8x8_t second = vmovn_u16(v2);
    vst1q_u8(dst, vcombine_u8(first, second));
}

// 32-bit lanes: each lane is narrowed twice, 32 -> 16 -> 8 bits (8 bytes written).
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
{
    const uint16x8_t halves = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
    vst1_u8(dst, vmovn_u16(halves));
}
// Vectorized handling of the elements left over after the main loop of
// inRangeCheck(), selected by element size. This primary template is the
// fallback for element sizes without a specialization: it leaves the index
// untouched so that the caller's scalar loop processes the whole tail.
template <typename T, int elsize> struct vtail
{
static inline void inRange(const T *, const T *, const T *,
u8 *, size_t &, size_t)
{
//do nothing since there couldn't be enough data
}
};
// Tail handler for 2-byte element types.
//
// Processes one extra 8-element vector when at least 8 elements remain in
// [x, width) and advances x past them; anything left is handled by the
// caller's scalar loop. For every element, dst receives 0xFF when
// rng1 <= src <= rng2 holds and 0x00 otherwise.
template <typename T> struct vtail<T, 2>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        // There are no more than 15 elements in the tail, so an 8-element
        // vector can be handled at most once.
        // "<=" (not "<"): a tail of exactly 8 elements still fits one full
        // vector, the load/store touches only [x, x + 8) which is in range.
        if( x + 8 <= width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            // Narrow the 16-bit lane masks to one byte per element.
            internal::vst1(dst + x, internal::vmovn(vd));
            x+=8;
        }
    }
};
// Tail handler for 1-byte element types.
//
// Processes one 16-element vector and then one 8-element vector when enough
// elements remain in [x, width), advancing x past them; anything left is
// handled by the caller's scalar loop. For every element, dst receives 0xFF
// when rng1 <= src <= rng2 holds and 0x00 otherwise.
template <typename T> struct vtail<T, 1>
{
    static inline void inRange(const T * src, const T * rng1, const T * rng2,
                               u8 * dst, size_t &x, size_t width)
    {
        typedef typename internal::VecTraits<T>::vec128 vec128;
        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<T>::vec64 vec64;
        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
        // There are no more than 31 elements in the tail, so 16+8, 16 or 8
        // elements can be handled once.
        // "<=" (not "<"): a remainder of exactly 16 (or 8) elements still
        // fits a full vector; the accesses stay inside [x, width).
        if( x + 16 <= width)
        {
            vec128 vs = internal::vld1q( src + x);
            vec128 vr1 = internal::vld1q(rng1 + x);
            vec128 vr2 = internal::vld1q(rng2 + x);
            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
            internal::vst1q(dst + x, vd);
            x+=16;
        }
        if( x + 8 <= width)
        {
            vec64 vs = internal::vld1( src + x);
            vec64 vr1 = internal::vld1(rng1 + x);
            vec64 vr2 = internal::vld1(rng2 + x);
            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
            internal::vst1(dst + x, vd);
            x+=8;
        }
    }
};
// Element-wise range test: dst(p) = 0xFF when rng1(p) <= src(p) <= rng2(p),
// 0x00 otherwise. src, rng1 and rng2 hold elements of type T; dst always
// holds one u8 per element.
//
// Each row is processed in steps of 32 / sizeof(T) elements (two 128-bit
// vectors), then vtail<> takes what it can of the remainder, and a scalar
// loop finishes the row.
template <typename T>
inline void inRangeCheck(const Size2D &_size,
                         const T * srcBase, ptrdiff_t srcStride,
                         const T * rng1Base, ptrdiff_t rng1Stride,
                         const T * rng2Base, ptrdiff_t rng2Stride,
                         u8 * dstBase, ptrdiff_t dstStride)
{
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;

    Size2D size(_size);
    // When every buffer is stored without row padding the whole image can be
    // treated as a single long row. The inputs hold T elements while dst
    // holds u8, so the input strides are compared against width * sizeof(T)
    // and the output stride against width (assumes strides are in bytes, which
    // is not visible in this file). For 1-byte T this is the same test as
    // requiring all four strides to equal the width.
    if (srcStride == rng1Stride &&
        srcStride == rng2Stride &&
        srcStride == (ptrdiff_t)(size.width * sizeof(T)) &&
        dstStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Largest multiple of the main-loop step that fits into a row.
    const size_t width = size.width & ~( 32/sizeof(T) - 1 );

    for(size_t j = 0; j < size.height; ++j)
    {
        const T * src = internal::getRowPtr( srcBase, srcStride, j);
        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
        size_t i = 0;
        for( ; i < width; i += 32/sizeof(T) )
        {
            internal::prefetch(src + i);
            internal::prefetch(rng1 + i);
            internal::prefetch(rng2 + i);

            vec128 vs = internal::vld1q( src + i);
            vec128 vr1 = internal::vld1q(rng1 + i);
            vec128 vr2 = internal::vld1q(rng2 + i);
            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));

            vs = internal::vld1q( src + i + 16/sizeof(T));
            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));

            // Narrow both lane masks to one byte per element and store them.
            vnst(dst + i, vd1, vd2);
        }
        // Vectorized part of the remainder (advances i), then scalar rest.
        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
        for( ; i < size.width; i++ )
            // -(true) == -1, which becomes 0xFF after the cast; -(false) == 0.
            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
    }
}
}
// INRANGEFUNC(T) defines the public inRange() overload for element type T.
// NEON build: checks the configuration and forwards to inRangeCheck<T>().
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
const T * srcBase, ptrdiff_t srcStride, \
const T * rng1Base, ptrdiff_t rng1Stride, \
const T * rng2Base, ptrdiff_t rng2Stride, \
u8 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
inRangeCheck(_size, srcBase, srcStride, \
rng1Base, rng1Stride, rng2Base, rng2Stride, \
dstBase, dstStride); \
}
#else
// Non-NEON build: the overload only performs the configuration check and
// never touches its (unnamed) arguments.
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
u8 *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// Public overloads, one per supported element type.
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS

238
3rdparty/carotene/src/integral.cpp vendored Normal file
View File

@ -0,0 +1,238 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
/*
 * Inclusive integral image of a u8 image: sum(i, j) receives the total of
 * all source pixels in rows [0, i] and columns [0, j]. The output has the
 * same width and height as the input.
 *
 * Row 0 is a plain running sum; every following row adds its own running
 * sum to the already finished output row above it.
 *
 * An empty image (zero width or height) is left untouched.
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void integral(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u32 * sumBase, ptrdiff_t sumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The first row is processed unconditionally below, so an empty image
    // must be rejected here to avoid touching a row that does not exist.
    if (size.width == 0 || size.height == 0)
        return;

    uint32x4_t v_zero = vmovq_n_u32(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);

    // Running total of the row so far, replicated in every lane.
    uint32x4_t prev = v_zero;
    size_t j = 0u;

    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sum + j);
        internal::prefetch(src + j);

        // Build an 8-wide prefix sum: add the pixel vector to copies of
        // itself moved by 1, 2 and 3 lanes (byte shifts of the 64-bit view;
        // assumes the usual little-endian lane layout - TODO confirm).
        uint8x8_t el8shr0 = vld1_u8(src + j);
        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

        uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
        uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);

        // Lane k now holds p[k] + p[k-1] + p[k-2] + p[k-3]: the low half is
        // already the prefix sum of pixels 0..3; low + high completes the
        // prefix sums of pixels 4..7.
        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
        uint32x4_t vsumh = vaddw_u16(prev, el4h);

        vst1q_u32(sum + j, vsuml);
        vst1q_u32(sum + j + 4, vsumh);

        // Carry the total of these 8 pixels into the next step.
        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
    }

    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
        sum[j] = (v += src[j]);

    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
        sum = internal::getRowPtr(sumBase, sumStride, i);

        prev = v_zero;
        j = 0u;

        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sum + j);
            internal::prefetch(src + j);

            // Start from the finished sums of the row above.
            uint32x4_t vsuml = vld1q_u32(prevSum + j);
            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);

            uint8x8_t el8shr0 = vld1_u8(src + j);
            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));

            vsuml = vaddq_u32(vsuml, prev);
            vsumh = vaddq_u32(vsumh, prev);

            uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
            uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);

            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));

            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
            vsumh = vaddw_u16(vsumh, el4h);

            vst1q_u32(sum + j, vsuml);
            vst1q_u32(sum + j + 4, vsumh);

            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
        }

        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
            sum[j] = (v += src[j]) + prevSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sumBase;
    (void)sumStride;
#endif
}
/*
 * Inclusive integral image of squared pixel values: sqsum(i, j) receives
 * the total of src(r, c)^2 over rows [0, i] and columns [0, j], stored as
 * f64. The output has the same width and height as the input.
 *
 * Row 0 is a plain running sum of squares; every following row adds its
 * own running sum to the already finished output row above it.
 *
 * An empty image (zero width or height) is left untouched.
 * Without CAROTENE_NEON only internal::assertSupportedConfiguration() is called.
 */
void sqrIntegral(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 f64 * sqsumBase, ptrdiff_t sqsumStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // The first row is processed unconditionally below, so an empty image
    // must be rejected here to avoid touching a row that does not exist.
    if (size.width == 0 || size.height == 0)
        return;

    uint16x8_t v_zero8 = vmovq_n_u16(0u);

    // the first iteration
    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);

    // Running total of squares in the current row.
    double prev = 0.;
    size_t j = 0u;

    for ( ; j + 7 < size.width; j += 8)
    {
        internal::prefetch(sqsum + j);
        internal::prefetch(src + j);

        uint8x8_t vsrc = vld1_u8(src + j);

        // q[k] = src[k]^2, and the same vector moved up by one lane
        // (a zero enters at lane 0).
        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

        // Pairwise sums q[k] + q[k-1], widened to 32 bits.
        uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
        uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

        // Combine the pairwise sums into the prefix sums of q[0..7]:
        // low(el8shr01l) -> prefixes 0,1; el2l -> 2,3; el2hl -> 4,5; el2hh -> 6,7.
        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

        u32 buf[8];
        vst1_u32(buf, vget_low_u32(el8shr01l));
        vst1_u32(buf+2, el2l);
        vst1_u32(buf+4, el2hl);
        vst1_u32(buf+6, el2hh);
        for(u32 k=0; k < 8; k++)
            sqsum[j+k] = prev + buf[k];
        // buf[7] is the sum of all 8 squares of this step.
        prev += buf[7];
    }

    for (; j < size.width; ++j)
        sqsum[j] = (prev += src[j]*src[j]);

    // the others
    for (size_t i = 1; i < size.height ; ++i)
    {
        src = internal::getRowPtr(srcBase, srcStride, i);
        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);

        prev = 0.;
        j = 0u;

        for ( ; j + 7 < size.width; j += 8)
        {
            internal::prefetch(sqsum + j);
            internal::prefetch(src + j);

            uint8x8_t vsrc = vld1_u8(src + j);

            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);

            uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
            uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));

            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);

            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));

            u32 buf[8];
            vst1_u32(buf, vget_low_u32(el8shr01l));
            vst1_u32(buf+2, el2l);
            vst1_u32(buf+4, el2hl);
            vst1_u32(buf+6, el2hh);
            // Row prefix so far + finished value of the row above + in-step prefix.
            for(u32 k=0; k < 8; k++)
                sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
            prev += buf[7];
        }

        for (; j < size.width; ++j)
            sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)sqsumBase;
    (void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS

112
3rdparty/carotene/src/intrinsics.hpp vendored Normal file
View File

@ -0,0 +1,112 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    // Hardware reciprocal estimate, improved by two refinement steps of the
    // form x' = x * vrecps(val, x).
    const float32x4_t estimate = vrecpeq_f32(val);
    const float32x4_t refined = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
    return vmulq_f32(vrecpsq_f32(val, refined), refined);
}
inline float32x2_t vrecp_f32(float32x2_t val)
{
    // 64-bit (two lane) counterpart of vrecpq_f32: reciprocal estimate plus
    // two refinement steps of the form x' = x * vrecps(val, x).
    const float32x2_t estimate = vrecpe_f32(val);
    const float32x2_t refined = vmul_f32(vrecps_f32(val, estimate), estimate);
    return vmul_f32(vrecps_f32(val, refined), refined);
}
// calculate reciprocal sqrt and sqrt values
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    // Reciprocal square root: hardware estimate followed by two refinement
    // steps of the form e' = e * vrsqrts(e * e, val).
    float32x4_t est = vrsqrteq_f32(val);

    float32x4_t estSquared = vmulq_f32(est, est);
    est = vmulq_f32(vrsqrtsq_f32(estSquared, val), est);

    estSquared = vmulq_f32(est, est);
    est = vmulq_f32(vrsqrtsq_f32(estSquared, val), est);

    return est;
}
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    // 64-bit (two lane) counterpart of vrsqrtq_f32: estimate plus two
    // refinement steps of the form e' = e * vrsqrts(e * e, val).
    float32x2_t est = vrsqrte_f32(val);

    float32x2_t estSquared = vmul_f32(est, est);
    est = vmul_f32(vrsqrts_f32(estSquared, val), est);

    estSquared = vmul_f32(est, est);
    est = vmul_f32(vrsqrts_f32(estSquared, val), est);

    return est;
}
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    // Square root obtained as the reciprocal of the reciprocal square root.
    const float32x4_t invRoot = vrsqrtq_f32(val);
    return vrecpq_f32(invRoot);
}
inline float32x2_t vsqrt_f32(float32x2_t val)
{
    // Square root obtained as the reciprocal of the reciprocal square root.
    const float32x2_t invRoot = vrsqrt_f32(val);
    return vrecp_f32(invRoot);
}
// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    // No 128-bit table lookup is used on this path: split the table into its
    // two 64-bit halves and use the two-register lookup instead. The halves
    // are extracted with intrinsics rather than by reading the inactive
    // member of a union, which is not well-defined C++.
    uint8x8x2_t table;
    table.val[0] = vget_low_u8(a);
    table.val[1] = vget_high_u8(a);
    return vtbl2_u8(table, b);
#endif
}
} }
#endif

713
3rdparty/carotene/src/laplacian.cpp vendored Normal file
View File

@ -0,0 +1,713 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
    // Requirements checked here: a supported configuration, at least 8
    // columns, and a constant or replicated border.
    if (!isSupportedConfiguration())
        return false;

    if (size.width < 8)
        return false;

    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REPLICATE;
}
// 3x3 filter on a u8 image: every output pixel is the sum of its 3x3
// neighbourhood minus 9 times the centre pixel, saturated to u8.
//
// size                 - image dimensions (asserted to be supported by
//                        isLaplacian3x3Supported)
// srcBase / srcStride  - source rows, addressed through internal::getRowPtr
// dstBase / dstStride  - destination rows, addressed through internal::getRowPtr
// border               - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE
// borderValue          - pixel value used outside the image for the constant border
//
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void Laplacian3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
// column sum of three border pixels (constant border)
const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
const uint16x8_t v_zero = vdupq_n_u16(0);
const uint8x8_t v_border = vdup_n_u8(borderValue);
// centre-row pixels of the block whose result is stored next
uint8x8_t vsub;
// sliding window of vertical (3-row) column sums: previous, current and next 8 columns
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
for (ptrdiff_t y = 0; y < height; ++y)
{
// With a constant border the missing row above/below is represented by NULL
// and replaced by borderValue; otherwise the row index is clamped.
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
// scalar column sums carried into the tail loop below
s16 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
// The vector loop loads 8 bytes at offset x for every x <= bwidth; on the
// last two rows the bound is lowered by 8 columns.
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
// perform vertical convolution
for ( ; x <= bwidth; x += 8)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
uint8x8_t x1 = vld1_u8(srow1 + x);
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 8 >= bwidth)
{
// x3 is the column the scalar tail starts at, x4 its left neighbour
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border_x3;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
vsub = x1;
// nothing is stored on the first iteration: results lag the loads by one block
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u16(tprev, tcurr, 7);
t1 = tcurr;
t2 = vextq_u16(tcurr, tnext, 1);
// and add them
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
// subtract 9 * centre pixel, computed as (centre << 3) + centre
int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
// saturate to u8 and store the block loaded one iteration earlier
uint8x8_t it0 = vqmovun_s16(tt0);
vst1_u8(drow + x - 8, it0);
vsub = x1;
}
// step back to the first column that has not been stored yet
x -= 8;
if (x == width)
--x;
// scalar tail for the remaining columns, including the right border
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue * 3;
else if (border == BORDER_MODE_REPLICATE)
nextx = srow2[x] + srow1[x] + srow0[x];
}
else
{
nextx = (srow2 ? srow2[x + 1] : borderValue) +
srow1[x + 1] +
(srow0 ? srow0[x + 1] : borderValue);
}
s32 val = (prevx + currx + nextx) - 9 * srow1[x];
drow[x] = internal::saturate_cast<u8>((s32)val);
// make shift
prevx = currx;
currx = nextx;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
    // Requirements checked here: a supported configuration, at least 8
    // columns and 1 row, and one of the four handled border modes.
    if (!isSupportedConfiguration())
        return false;

    if (size.width < 8 || size.height < 1)
        return false;

    return border == BORDER_MODE_CONSTANT ||
           border == BORDER_MODE_REFLECT ||
           border == BORDER_MODE_REFLECT101 ||
           border == BORDER_MODE_REPLICATE;
}
// Cross-shaped Laplacian on a u8 image with s16 output:
//   dst(y, x) = src(y, x-1) + src(y, x+1) + src(y-1, x) + src(y+1, x) - 4 * src(y, x)
//
// size                 - image dimensions (asserted to be supported by
//                        isLaplacianOpenCVSupported)
// srcBase / srcStride  - source rows, addressed through internal::getRowPtr
// dstBase / dstStride  - destination rows, addressed through internal::getRowPtr
// border               - CONSTANT, REPLICATE, REFLECT or REFLECT101
// borderValue          - pixel value used outside the image for the constant border
//
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Scratch row filled with borderValue, standing in for the rows above and
    // below the image when the border is constant.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // 2 leading bytes + cols pixels + 8 trailing bytes: the vector loop
        // below issues 8-byte loads at offsets up to cols, so the scratch row
        // must stay readable up to tmp[cols + 7].
        _tmp.assign(cols + 2 + 8, borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // tcurr/tnext: vertical term (above + below - 4 * centre) of the
        // block being stored and of the block just loaded.
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        // xx0/xx1/xx2: centre-row pixels of three consecutive 8-column blocks.
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);
        ptrdiff_t x = 0;
        // On the last two rows the vector loop stops 8 columns earlier.
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);

            if(x) {
                xx0 = xx1;
                xx1 = xx2;
            } else {
                xx1 = x1;
                // make border: lane 7 of xx1 plays the role of column -1
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
                }
            }
            xx2 = x1;

            if(x) {
                tcurr = tnext;
            }
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));

            if(!x) {
                // Results lag the loads by one block: nothing to store yet.
                tcurr = tnext;
                continue;
            }
            // left and right neighbours of the block being stored
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);

            vst1q_s16(drow + x - 8, t0);
        }

        // Step back to the first column that still has to be (re)computed.
        x -= 8;
        if(x == cols){
            x--;
        }

        // Scalar tail, including the right border.
        for( ; x < cols; x++ )
        {
            s16 nextx;
            s16 prevx;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x-1];
                nextx = x == cols-1 ? v1[x] : v1[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x-1];
                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x-1];
                nextx = x == cols-1 ? borderValue : v1[x+1];
            }
            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// Diagonal 3x3 Laplacian on a u8 image with s16 output:
//   dst(y, x) = 2 * (src(y-1, x-1) + src(y-1, x+1) + src(y+1, x-1) + src(y+1, x+1)
//                    - 4 * src(y, x))
//
// size                 - image dimensions (asserted to be supported by
//                        isLaplacianOpenCVSupported)
// srcBase / srcStride  - source rows, addressed through internal::getRowPtr
// dstBase / dstStride  - destination rows, addressed through internal::getRowPtr
// border               - CONSTANT, REPLICATE, REFLECT or REFLECT101
// borderValue          - pixel value used outside the image for the constant border
//
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Scratch row filled with borderValue, standing in for the rows above and
    // below the image when the border is constant.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // 2 leading bytes + cols pixels + 8 trailing bytes: the vector loop
        // below issues 8-byte loads at offsets up to cols, so the scratch row
        // must stay readable up to tmp[cols + 7].
        _tmp.assign(cols + 2 + 8, borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v2 = 0;
        // make border
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // tprev/tcurr/tnext: (above + below) column sums of three consecutive
        // 8-column blocks; tc/tcnext: 4 * centre pixel of the block being
        // stored and of the block just loaded.
        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);
        int16x8_t t0, t2, tcnext;
        ptrdiff_t x = 0;
        // On the last two rows the vector loop stops 8 columns earlier.
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));

            if(x) {
                tprev = tcurr;
                tcurr = tnext;
            }
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));

            if(!x) {
                tcurr = tnext;
                tc = tcnext;

                // make border: lane 7 of tcurr plays the role of column -1
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    // The lane holds above + below, i.e. two border pixels;
                    // this matches prevx + prevx2 in the scalar tail.
                    tcurr = vsetq_lane_s16(2 * borderValue, tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
                }
                // Results lag the loads by one block: nothing to store yet.
                continue;
            }

            // column sums of the left and right neighbours
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);

            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;

            t0 = vshlq_n_s16(t0, 1);
            vst1q_s16(drow + x - 8, t0);
        }

        // Step back to the first column that still has to be (re)computed.
        x -= 8;
        if(x == cols){
            x--;
        }

        // Scalar tail, including the right border.
        for( ; x < cols; x++ )
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x-1];
                prevx2 = x == 0 ? v2[0] : v2[x-1];
                nextx = x == cols-1 ? v0[x] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x-1];
                prevx2 = x == 0 ? v2[1] : v2[x-1];
                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x-1];
                prevx2 = x == 0 ? borderValue : v2[x-1];
                nextx = x == cols-1 ? borderValue : v0[x+1];
                nextx2 = x == cols-1 ? borderValue : v2[x+1];
            }
            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// 5x5 Laplacian on a u8 image with s16 output. With the per-column terms
//   A(c) =   v0 + 2*v1 +  2*v2 + 2*v3 +   v4   (columns x-2 and x+2)
//   B(c) = 2*v0        -  4*v2        + 2*v4   (columns x-1 and x+1)
//   C(c) = 2*v0 - 4*v1 - 12*v2 - 4*v3 + 2*v4   (column x)
// taken over the rows y-2 .. y+2, the result is
//   dst(y, x) = 2 * (A(x-2) + B(x-1) + C(x) + B(x+1) + A(x+2))
//
// size                 - image dimensions (asserted to be supported by
//                        isLaplacianOpenCVSupported)
// srcBase / srcStride  - source rows, addressed through internal::getRowPtr
// dstBase / dstStride  - destination rows, addressed through internal::getRowPtr
// border               - CONSTANT, REPLICATE, REFLECT or REFLECT101
// borderValue          - pixel value used outside the image for the constant border
//
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void Laplacian5OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Scratch row filled with borderValue, standing in for the rows above and
    // below the image when the border is constant.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        // 2 leading bytes + cols pixels + 8 trailing bytes: the vector loop
        // below issues 8-byte loads at offsets up to cols, so the scratch row
        // must stay readable up to tmp[cols + 7].
        _tmp.assign(cols + 2 + 8, borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;
        const u8* v1 = 0;
        const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* v3 = 0;
        const u8* v4 = 0;
        // make border
        if (border == BORDER_MODE_REPLICATE) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
        } else if (border == BORDER_MODE_REFLECT) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
        } else if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
            v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
            v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // History of the per-column terms: "OldOld" holds the block loaded in
        // the current iteration, "Old" the block before it and "OldOldOld"
        // the block two iterations back.
        int16x8_t tnext, tc, t0;
        int16x8_t tnext2, tnext3;
        int16x8_t tnext1Old, tnext2Old, tnext3Old;
        int16x8_t tnext4OldOldOld, tnext5OldOldOld;

        int16x8_t tcurr1 = vmovq_n_s16(0x0);
        int16x8_t tnext1 = vmovq_n_s16(0x0);
        int16x8_t tprev1 = vmovq_n_s16(0x0);
        int16x8_t tpprev1 = vmovq_n_s16(0x0);
        int16x8_t tppprev1 = vmovq_n_s16(0x0);

        int16x8_t tnext4Old = vmovq_n_s16(0x0);
        int16x8_t tnext5Old = vmovq_n_s16(0x0);
        int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext5OldOld = vmovq_n_s16(0x0);

        // do vertical convolution
        ptrdiff_t x = 0;
        // On the last three rows the vector loop stops 8 columns earlier.
        const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            internal::prefetch(v3 + x);
            internal::prefetch(v4 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            uint8x8_t x3 = vld1_u8(v3 + x);
            uint8x8_t x4 = vld1_u8(v4 + x);
            if(x) {
                tcurr1 = tnext1;
            }

            // age the history by one block
            tnext4OldOldOld = tnext4Old;
            tnext5OldOldOld = tnext5Old;
            tnext1Old = tnext1OldOld;
            tnext2Old = tnext2OldOld;
            tnext3Old = tnext3OldOld;
            tnext4Old = tnext4OldOld;
            tnext5Old = tnext5OldOld;

            tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
            tnext3 = vshlq_n_s16(tnext3, 1);

            tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
            tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
            tnext2 = vsubq_s16(tc, tnext);

            tnext1 = vaddq_s16(tnext3, tnext2);
            // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4

            tnext2 = vshlq_n_s16(tnext2, 1);
            // tnext2 = 2*x4 - 4*x2 + 2*x0

            tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
            // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4

            tnext1OldOld = tnext1;
            tnext2OldOld = tnext2;
            tnext3OldOld = tnext3;
            tnext4OldOld = tnext2;
            tnext5OldOld = tnext1;

            if(x) {
                // terms for the block loaded one iteration earlier:
                // A(c+2), B(c+1) and C(c)
                tnext1 = vextq_s16(tnext1Old, tnext1, 2);
                tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
                tprev1 = tnext3Old;

                if(x!=8) {
                    // B(c-1) and A(c-2); for the first stored block they come
                    // from the border set-up below instead
                    tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
                    tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
                }
            }

            if(!x) {
                // make border: lane 0 of tpprev1 is B(-1), lanes 0 and 1 of
                // tprev1 are A(-2) and A(-1)
                if (border == BORDER_MODE_REPLICATE) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT101) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
                } else if (border == BORDER_MODE_CONSTANT) {
                    // For a column made of borderValue b the terms are
                    // B = 2b - 4b + 2b = 0 and A = b + 2b + 2b + 2b + b = 8b,
                    // the same values the scalar tail uses.
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(0, tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(8 * borderValue, tprev1, 0);
                    tprev1 = vsetq_lane_s16(8 * borderValue, tprev1, 1);
                }
                tppprev1 = tprev1;
                // Results lag the loads by one block: nothing to store yet.
                continue;
            }

            t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
            t0 = vaddq_s16(t0, t0);
            vst1q_s16(drow + x - 8, t0);
        }

        // Step back to the first column that still has to be (re)computed;
        // the last two columns are always redone by the scalar tail.
        x -= 8;
        if(x >= cols - 1)
            x = cols-2;

        s16 pprevx = 0;
        s16 prevx = 0;
        s16 nextx = 0;
        s16 nnextx = 0;

        // Scalar tail, including the right border.
        for( ; x < cols; x++ )
        {
            if (x == 0) {
                // make border
                if (border == BORDER_MODE_REPLICATE) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
                    prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;
                    prevx = 0;
                }
            } else if (x == 1) {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;
                }
                prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
            } else {
                pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
            }
            s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
            if (x == cols-1) {
                // make border
                if (border == BORDER_MODE_REPLICATE) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_REFLECT) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
                    nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nextx = 0;
                    nnextx = 8 * borderValue;
                }
            } else if (x == cols-2) {
                // make border
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nnextx = 8 * borderValue;
                }
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
            } else {
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
                nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
            }
            s16 res = pprevx + prevx + currx + nextx + nnextx;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

160
3rdparty/carotene/src/magnitude.cpp vendored Normal file
View File

@ -0,0 +1,160 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Functor computing sqrt(a*a + b*b) for s16 inputs, with the result
// converted to s32 and narrowed to s16 with saturation.
struct Magnitude
{
    typedef s16 type;

    // 8 lanes: the low and high halves are handled independently by the
    // 4-lane overload and then recombined.
    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
                     int16x8_t & v_dst) const
    {
        int16x4_t v_lo, v_hi;
        (*this)(vget_low_s16(v_src0), vget_low_s16(v_src1), v_lo);
        (*this)(vget_high_s16(v_src0), vget_high_s16(v_src1), v_hi);
        v_dst = vcombine_s16(v_lo, v_hi);
    }

    // 4 lanes: widen the squares to s32, sum them in f32, take the square
    // root, convert back to s32 and narrow with saturation.
    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
                     int16x4_t & v_dst) const
    {
        const float32x4_t v_sq0 = vcvtq_f32_s32(vmull_s16(v_src0, v_src0));
        const float32x4_t v_sq1 = vcvtq_f32_s32(vmull_s16(v_src1, v_src1));
        const float32x4_t v_root = internal::vsqrtq_f32(vaddq_f32(v_sq0, v_sq1));
        v_dst = vqmovn_s32(vcvtq_s32_f32(v_root));
    }

    // Single element.
    void operator() (const short * src0, const short * src1, short * dst) const
    {
        const f32 a = (f32)src0[0];
        const f32 b = (f32)src1[0];
        const s32 root = (s32)sqrtf(a * a + b * b);
        dst[0] = internal::saturate_cast<s16>(root);
    }
};
// Functor computing sqrt(a*a + b*b) for f32 inputs.
struct MagnitudeF32
{
    typedef f32 type;

    // 4 lanes.
    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
                     float32x4_t & v_dst) const
    {
        const float32x4_t v_sq0 = vmulq_f32(v_src0, v_src0);
        const float32x4_t v_sq1 = vmulq_f32(v_src1, v_src1);
        v_dst = internal::vsqrtq_f32(vaddq_f32(v_sq0, v_sq1));
    }

    // 2 lanes.
    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
                     float32x2_t & v_dst) const
    {
        const float32x2_t v_sq0 = vmul_f32(v_src0, v_src0);
        const float32x2_t v_sq1 = vmul_f32(v_src1, v_src1);
        v_dst = internal::vsqrt_f32(vadd_f32(v_sq0, v_sq1));
    }

    // Single element.
    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
    {
        const f32 a = src0[0];
        const f32 b = src1[0];
        dst[0] = sqrtf(a * a + b * b);
    }
};
} // namespace
#endif
// Element-wise magnitude of two s16 images: applies the Magnitude functor to
// every pair of elements through internal::vtransform.
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void magnitude(const Size2D &size,
               const s16 * src0Base, ptrdiff_t src0Stride,
               const s16 * src1Base, ptrdiff_t src1Stride,
               s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Magnitude op;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         op);
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Element-wise magnitude of two f32 images: applies the MagnitudeF32 functor
// to every pair of elements through internal::vtransform.
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void magnitude(const Size2D &size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride,
               f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    MagnitudeF32 op;
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         op);
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

163
3rdparty/carotene/src/meanstddev.cpp vendored Normal file
View File

@ -0,0 +1,163 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
// Mean and standard deviation of a u8 image.
//
// The sum and the sum of squares are obtained from sqsum(); the variance is
// computed as E[x^2] - E[x]^2 and clamped at zero before the square root.
// pMean and pStdDev may each be NULL, in which case that result is skipped.
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f64 total = 0.0f, totalSq = 0.0f;
    sqsum(size, srcBase, srcStride, &total, &totalSq, 1);

    // calc mean and stddev
    const f64 invCount = 1.0 / size.total();
    const f64 avg = total * invCount;
    const f64 variance = std::max(totalSq * invCount - avg * avg, 0.0);
    const f64 sigma = sqrt(variance);

    if (pMean)
        *pMean = (f32)avg;
    if (pStdDev)
        *pStdDev = (f32)sigma;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
// Mean and standard deviation of a u16 image.
//
// Each row is processed in blocks of at most blockSize0 elements: within a
// block the sum is accumulated in u32 lanes and the sum of squares in f32
// lanes, and both are folded into f64 totals at the end of the block. The
// variance is computed as E[x^2] - E[x]^2 and clamped at zero before the
// square root. pMean and pStdDev may each be NULL, in which case that result
// is skipped.
// Without CAROTENE_NEON the body is empty apart from silencing the unused
// parameters.
void meanStdDev(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// blockSize0: maximum number of elements accumulated in vector registers
// before flushing; roiw4: row width rounded down to a multiple of 4
size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
f64 fsum = 0.0f, fsqsum = 0.0f;
// arsum[0..3] receives the lane sums, arsum[4..7] the lane sums of squares
f32 arsum[8];
uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw4)
{
// despite its name, blockSize is the end index of the current block
size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
v_sum = v_zero;
v_sqsum = v_zero_f;
// 16 elements per iteration
for ( ; j + 16 < blockSize ; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// 0
uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
// 1
v_srclo = vmovl_u16(vget_low_u16(v_src1));
v_srchi = vmovl_u16(vget_high_u16(v_src1));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
v_srclo_f = vcvtq_f32_u32(v_srclo);
v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
}
// remaining elements of the block, 4 per iteration
for ( ; j < blockSize; j += 4)
{
uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
float32x4_t v_src_f = vcvtq_f32_u32(v_src);
v_sum = vaddq_u32(v_sum, v_src);
v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
}
// flush the per-lane partial results into the f64 totals
vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
vst1q_f32(arsum + 4, v_sqsum);
fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
}
// collect a few last elements in the current row
for ( ; j < size.width; ++j)
{
f32 srcval = src[j];
fsum += srcval;
fsqsum += srcval * srcval;
}
}
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
} // namespace CAROTENE_NS

227
3rdparty/carotene/src/median_filter.cpp vendored Normal file
View File

@ -0,0 +1,227 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in the public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Builds the "left neighbour" vector for the first 16 bytes of a row when no
// real pixels exist to the left: the result is r shifted right by cn bytes,
// with the vacated first cn bytes filled by the first cn bytes of r.
// The scratch buffer has room for a shift of at most 8 bytes (cn <= 8).
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 scratch[16 + 8];
    u8 * const pixels = scratch + cn;

    // place r after a cn-byte gap ...
    vst1q_u8(pixels, r);
    // ... and fill the gap with a copy of the first pixel's channels
    for (u32 c = 0; c != cn; ++c)
        scratch[c] = pixels[c];

    return vld1q_u8(scratch);
}
// Builds the "right neighbour" vector for the last 8 bytes of a row when no
// real pixels exist to the right: the result is r shifted left by cn bytes,
// with the vacated last cn bytes filled by the last cn bytes of r.
// The scratch buffer has room for a shift of at most 8 bytes (cn <= 8).
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 scratch[8 + 8];

    vst1_u8(scratch, r);
    // append a copy of the last pixel's channels right behind r
    for (u32 c = 0; c != cn; ++c)
        scratch[8 + c] = scratch[8 - cn + c];

    return vld1_u8(scratch + cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
// ELT(num, level) pastes together the name of the local variable that holds
// window element <num> after network step <level>, e.g. ELT(4, 19) -> v4_lv19.
// Every step writes to a fresh name instead of overwriting its inputs.
#define ELT(num, level) v ## num ## _lv ## level
// One full compare-exchange: PIX_MIN and PIX_MAX are not defined here; the
// code that expands SORT9 defines them for the vector width it works with
// (min result becomes the new version of element a, max result of element b).
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
// The 9-input median network drawn above. Inputs are v0_lv00 .. v8_lv00; the
// arguments of each step are (element a, its current level, element b, its
// current level, level to create). Steps written as a bare PIX_MIN or PIX_MAX
// compute only one half of the exchange. The users of this macro read the
// result from v4_lv19.
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
// Reports whether medianFilter3x3() can handle the given image: the
// configuration must be supported, there may be at most 8 interleaved
// channels, and a row must be at least 16 + numChannels pixels wide.
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
    if (!isSupportedConfiguration())
        return false;
    if (numChannels > 8)
        return false;
    return size.width >= 16 + numChannels;
}
// 3x3 median filter over an interleaved u8 image with numChannels channels.
//
// size, numChannels     - image dimensions in pixels and channels per pixel; must satisfy
//                         isMedianFilter3x3Supported()
// srcBase, srcStride    - source image
// srcMargin             - how many valid pixels exist outside the image on each side; a zero
//                         margin makes the filter replicate the edge row/pixel instead
// dstBase, dstStride    - destination image
//
// Every row is handled as a flat array of colsn = width * cn bytes: a forward
// pass of 16-byte vectors, then a backward pass of 8-byte vectors starting at
// the end of the row. Without CAROTENE_NEON only the configuration check runs.
void medianFilter3x3(const Size2D &size, u32 numChannels,
const u8 *srcBase, ptrdiff_t srcStride,
const Margin &srcMargin,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
u32 cn = numChannels;
size_t colsn = size.width * cn;
for (size_t i = 0; i < size.height; ++i) {
// psrc0 / psrc1 / psrc2: row above, current row, row below. On the first/last
// row with no margin available the current row stands in for the missing one.
const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
{
// First 16 bytes of the row. v3..v5 are the centre column of the window,
// v6..v8 the right neighbours (one pixel = cn bytes further), v0..v2 the left
// neighbours, taken from the margin if present and replicated otherwise.
uint8x16_t v3_lv00 = vld1q_u8(psrc0);
uint8x16_t v4_lv00 = vld1q_u8(psrc1);
uint8x16_t v5_lv00 = vld1q_u8(psrc2);
uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
// jump straight into the loop body so the j == 0 block reuses the sorting code
// below without executing the loop's own (border-unaware) loads
goto medianBlur3x3_mainBody;
// NOTE(review): the loads at psrc + j + cn fetch 16 bytes and can therefore reach
// up to cn - 1 bytes beyond colsn on the last iteration - confirm callers
// guarantee that memory is readable.
for (; j < colsn - 16; j += 16) {
internal::prefetch(psrc0 + j);
internal::prefetch(psrc1 + j);
internal::prefetch(psrc2 + j);
v0_lv00 = vld1q_u8(psrc0 + j - cn);
v1_lv00 = vld1q_u8(psrc1 + j - cn);
v2_lv00 = vld1q_u8(psrc2 + j - cn);
v3_lv00 = vld1q_u8(psrc0 + j);
v4_lv00 = vld1q_u8(psrc1 + j);
v5_lv00 = vld1q_u8(psrc2 + j);
v6_lv00 = vld1q_u8(psrc0 + j + cn);
v7_lv00 = vld1q_u8(psrc1 + j + cn);
v8_lv00 = vld1q_u8(psrc2 + j + cn);
medianBlur3x3_mainBody:
// 128-bit flavour of the compare-exchange primitives consumed by SORT9
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
// v4_lv19 is the output of the network, i.e. the per-byte median
vst1q_u8(pdst + j, v4_lv19);
}
}
{
// Tail: start with the last 8 bytes of the row (right neighbours from the margin
// or replicated), then step backwards 8 bytes at a time while k >= j - 8, which
// covers everything the 16-byte loop above did not write. Some bytes are
// computed twice; the results agree as long as src and dst do not alias
// (NOTE(review): in-place use is not guarded against here).
size_t k = colsn - 8;
uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
// same trick as above: enter the loop body with the border-aware loads in place
goto medianBlur3x3_tailBody;
for (; k >= j - 8; k -= 8) {
v0_lv00 = vld1_u8(psrc0 + k - cn);
v1_lv00 = vld1_u8(psrc1 + k - cn);
v2_lv00 = vld1_u8(psrc2 + k - cn);
v3_lv00 = vld1_u8(psrc0 + k);
v4_lv00 = vld1_u8(psrc1 + k);
v5_lv00 = vld1_u8(psrc2 + k);
v6_lv00 = vld1_u8(psrc0 + k + cn);
v7_lv00 = vld1_u8(psrc1 + k + cn);
v8_lv00 = vld1_u8(psrc2 + k + cn);
medianBlur3x3_tailBody:
// 64-bit flavour of the compare-exchange primitives consumed by SORT9
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1_u8(pdst + k, v4_lv19);
}
}
}
#else
(void)size;
(void)numChannels;
(void)srcBase;
(void)srcStride;
(void)srcMargin;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

139
3rdparty/carotene/src/min_max.cpp vendored Normal file
View File

@ -0,0 +1,139 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct Min
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
template <typename T>
struct Max
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
// NEON build: IMPL_OP(fun, op, type) defines the public binary function `fun`
// for element type `type`. After the configuration check it forwards both
// sources and the destination to internal::vtransform together with a
// default-constructed op<type> functor.
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
// Non-NEON build: the same signature is still emitted so the symbol exists,
// but the body consists of the configuration check only; the parameters are
// left unnamed because none of them is used.
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
// IMPL_MINMAX(type) emits both min() and max() for one element type.
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
// Overloads are generated for every 8/16/32-bit integer type and for f32.
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS

1340
3rdparty/carotene/src/minmaxloc.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

728
3rdparty/carotene/src/morph.cpp vendored Normal file
View File

@ -0,0 +1,728 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
// Reports whether erode3x3()/dilate3x3() can handle the request: the
// configuration must be supported, the image must be at least 16 pixels wide,
// and the border mode must be CONSTANT or REPLICATE.
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
    if (!isSupportedConfiguration())
        return false;
    if (size.width < 16)
        return false;
    return border == BORDER_MODE_CONSTANT || border == BORDER_MODE_REPLICATE;
}
#ifdef CAROTENE_NEON
namespace {
// Erosion primitive: the minimum of two operands, available for 128-bit
// vectors, 64-bit vectors and single u8 values.
struct ErodeVecOp
{
    // Value substituted for pixels outside the image. With a replicate border
    // it is forced to the largest u8 value, the neutral element of min().
    u8 borderValue;

    ErodeVecOp() : borderValue(0) {}

    ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::max()
                                                    : borderValue_)
    {
    }

    u8 operator()(u8 a, u8 b) const
    {
        return std::min(a, b);
    }

    uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmin_u8(a, b);
    }

    uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vminq_u8(a, b);
    }
};
// Dilation primitive: the maximum of two operands, available for 128-bit
// vectors, 64-bit vectors and single u8 values.
struct DilateVecOp
{
    // Value substituted for pixels outside the image. With a replicate border
    // it is forced to the smallest u8 value, the neutral element of max().
    u8 borderValue;

    DilateVecOp() : borderValue(0) {}

    DilateVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::min()
                                                    : borderValue_)
    {
    }

    u8 operator()(u8 a, u8 b) const
    {
        return std::max(a, b);
    }

    uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmax_u8(a, b);
    }

    uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vmaxq_u8(a, b);
    }
};
// Shared implementation of the 3x3 erode/dilate for single-channel u8 images.
//
// size                 - image dimensions
// srcBase, srcStride   - source image
// dstBase, dstStride   - destination image
// border               - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE (see isMorph3x3Supported)
// vop                  - the min/max primitive; vop.borderValue supplies the constant border value
//
// Each row is processed in two phases: a NEON loop that combines the three
// source rows vertically and then the three horizontal neighbours using a
// sliding window of three 16-byte vectors, and a scalar loop for the columns
// the NEON loop did not write.
template <typename VecOp>
void morph3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, const VecOp & vop)
{
u8 borderValue = vop.borderValue;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
const uint8x16_t v_zero = vdupq_n_u8(0);
const uint8x16_t v_border = vdupq_n_u8(borderValue);
// tprev/tcurr/tnext: vertically combined results of three consecutive 16-byte blocks
uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
for (ptrdiff_t y = 0; y < height; ++y)
{
// srow0/srow2 are NULL when the neighbouring row lies outside the image and the
// border is constant (v_border / borderValue is used instead); otherwise the row
// index is clamped to the image, which yields the replicate behaviour.
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
// prevx/currx/nextx: scalar counterpart of the sliding window for the tail loop
u8 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
// The vector loop may load 16 bytes starting at any x <= bwidth. For the last two
// rows the bound is lowered by 16 (presumably so that the loads never run past the
// end of the image buffer - TODO confirm); the scalar loop picks up the rest.
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
// perform vertical convolution
for ( ; x <= bwidth; x += 16)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
uint8x16_t x1 = vld1q_u8(srow1 + x);
uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 16 >= bwidth)
{
// x3: column the scalar loop will start at, x4: its left neighbour
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = vop(srow1[x4],
vop(srow2 ? srow2[x4] : borderValue,
srow0 ? srow0[x4] : borderValue));
currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vop(vop(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
// tcurr plays the role of the (non-existent) block left of column 0; only its
// last lane is consumed, by the vextq_u8(tprev, tcurr, 15) of the next iteration
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
// nothing can be stored yet: the first block still lacks its right neighbour
continue;
}
// combine 3 "shifted" vectors
// t0 = left neighbours, t1 = centre, t2 = right neighbours of block tcurr
t0 = vextq_u8(tprev, tcurr, 15);
t1 = tcurr;
t2 = vextq_u8(tcurr, tnext, 1);
// and add them
t0 = vop(t0, vop(t1, t2));
// the result belongs to the block loaded one iteration earlier
vst1q_u8(drow + x - 16, t0);
}
// step back to the first column that has not been written yet
x -= 16;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue;
else if (border == BORDER_MODE_REPLICATE)
// srow0/srow2 are never NULL with a replicate border
nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
}
else
nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
srow0 ? srow0[x + 1] : borderValue),
srow1[x + 1]);
drow[x] = vop(prevx, vop(currx, nextx));
// make shift
prevx = currx;
currx = nextx;
}
}
}
} // namespace
#endif
// 3x3 erosion (neighbourhood minimum) of a single-channel u8 image.
//
// The request must satisfy isMorph3x3Supported(size, border). borderValue is
// the value assumed outside the image for BORDER_MODE_CONSTANT; for
// BORDER_MODE_REPLICATE the functor replaces it by the neutral element.
// Without CAROTENE_NEON only the configuration check is performed.
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    const bool supported = isMorph3x3Supported(size, border);
    internal::assertSupportedConfiguration(supported);
#ifdef CAROTENE_NEON
    const ErodeVecOp minOp(border, borderValue);
    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, minOp);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
// 3x3 dilation (neighbourhood maximum) of a single-channel u8 image.
//
// The request must satisfy isMorph3x3Supported(size, border). borderValue is
// the value assumed outside the image for BORDER_MODE_CONSTANT; for
// BORDER_MODE_REPLICATE the functor replaces it by the neutral element.
// Without CAROTENE_NEON only the configuration check is performed.
void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    const bool supported = isMorph3x3Supported(size, border);
    internal::assertSupportedConfiguration(supported);
#ifdef CAROTENE_NEON
    const DilateVecOp maxOp(border, borderValue);
    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, maxOp);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
// Horizontal pass of a separable morphological filter.
//
// For every output byte i in [0, width*cn) this computes
//     dst[i] = op(src[i], src[i + cn], ..., src[i + (ksize - 1)*cn])
// where op is the min/max supplied by a default-constructed VecUpdate.
//
// src    - input row; bytes up to index (width + ksize - 1)*cn - 1 are read
// dst    - output row of width*cn bytes
// width  - number of output pixels
// cn     - interleaved channels per pixel
// ksize  - kernel width in pixels; 1 degenerates to a plain copy
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    // byte counts covered by the 16-byte and the 8-byte vector loops
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    // from here on width and ksize are measured in bytes, not pixels
    width *= cn;

    if (ksize == 1)
    {
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }

    ksize = ksize*cn;
    VecUpdate updateOp;

    switch(cn)
    {
    case 1:
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);
            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));
            vst1_u8(dst + i, s);
        }
        break;
    default:
        // same as above, but neighbouring pixels are cn bytes apart
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));
            vst1q_u8(dst + i, s);
        }
        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);
            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));
            vst1_u8(dst + i, s);
        }
        break;
    }

    // Scalar remainder, one channel at a time (src/dst advance by one byte per channel).
    ptrdiff_t i0 = i;
    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
    {
        // Two adjacent output pixels share all but one window element, so the
        // common part m is reduced once and completed separately for each.
        // The bound is written as "i + cn*2 <= width" rather than
        // "i <= width - cn*2": width is unsigned, so the subtraction wraps
        // around when fewer than two pixels remain (e.g. a 1-pixel-wide row)
        // and the loop would then run far out of bounds.
        for( i = i0; i + cn*2 <= width; i += cn*2 )
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for( j = cn*2; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            // j == ksize here, i.e. the last window element of pixel i + cn
            dst[i+cn] = updateOp(m, s[j]);
        }

        // at most one pixel of this channel is left
        for( ; i < width; i += cn )
        {
            const u8* s = src + i;
            u8 m = s[0];
            for( j = cn; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
// Vertical pass of a separable morphological filter.
//
// src[r] points to row r of already row-filtered data. Output row d (d in
// [0, count)) is the element-wise op of src[d] .. src[d + ksize - 1], where op
// is the min/max supplied by a default-constructed VecUpdate.
//
// src      - array of at least count + ksize - 1 row pointers
// dst      - first output row; consecutive rows are dststep bytes apart
// count    - number of output rows to produce
// width    - bytes per row
// ksize    - kernel height in rows
//
// While at least two output rows remain they are produced as a pair: both
// share rows src[1] .. src[ksize - 1], so that part is reduced only once.
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
size_t i, k;
// part of the row handled 32 bytes (two vectors) at a time
size_t width32 = width & -32;
VecUpdate updateOp;
uint8x16_t x0,x1,s0,s1;
if (ksize == 3)
{
// unrolled pair loop for the 3-row kernel: shared part is op(src[1], src[2])
for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
sptr = src[2] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
// first row of the pair: add src[0]
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
// second row of the pair: add src[3]
sptr = src[3] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
// scalar remainder of the pair; after the k loop k == ksize
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
}
else if (ksize > 1)
// generic pair loop: shared part is the op of src[1] .. src[ksize - 1]
for (; count > 1; count -= 2, dst += dststep*2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 2; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
// first row of the pair: add src[0]
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
// second row of the pair: add src[ksize] (k == ksize after the loop)
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
// Rows not covered by the pair loops: the odd last row, or every row when
// ksize == 1 (in which case the inner k loops do not execute and rows are copied).
for (; count > 0; count--, dst += dststep, src++)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[0] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 1; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
vst1q_u8(dst + i, s0);
vst1q_u8(dst + i + 16, s1);
}
for(; i < width; i++ )
{
u8 s = src[0][i];
for( k = 1; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = s;
}
}
}
// Generic separable erode/dilate with a rectangular ksize.width x ksize.height kernel.
//
// ssize, cn             - image dimensions in pixels and interleaved channels per pixel
// srcBase, srcStride    - source image
// dstBase, dstStride    - destination image
// ksize                 - kernel size
// anchorX, anchorY      - anchor position inside the kernel
// rowBorderType         - extrapolation used to the left/right of the image
// columnBorderType      - extrapolation used above/below the image
// borderValues          - cn values used for BORDER_MODE_CONSTANT
// borderMargin          - number of valid pixels available around the image on each side;
//                         they are read instead of synthesising a border
//
// Source rows are extended horizontally into srcRow, filtered with MorphRow<Op>
// into a ring buffer of bufRows rows, and MorphColumn<Op> then combines
// ksize.height ring-buffer rows into destination rows.
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
//Temporary buffers common for all iterations
// srcRow: one source row plus ksize.width - 1 border pixels (input of MorphRow)
std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
u8* srcRow = &_srcRow[0];
size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
// rows: per-iteration table of row pointers handed to MorphColumn
std::vector<u8*> _rows(bufRows);
u8** rows = &_rows[0];
// adjust swidthcn so that the used part of buffers stays compact in memory
ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
// ringBuf: bufRows row-filtered rows of swidthcn bytes each, aligned to 16 bytes
std::vector<u8> _ringBuf(swidthcn*bufRows+16);
u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
// borderTab: for non-constant row borders, the source byte offset of every border byte
std::vector<ptrdiff_t> _borderTab(borderLength);
ptrdiff_t * borderTab = &_borderTab[0];
std::vector<u8> _constBorderValue;
std::vector<u8> _constBorderRow;
u8 * constBorderValue = NULL;
u8 * constBorderRow = NULL;
if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
{
// constBorderValue: borderValues repeated with period cn up to borderLength bytes
_constBorderValue.resize(borderLength);
constBorderValue = &_constBorderValue[0];
size_t i;
for(i = 0; i < cn; i++)
constBorderValue[i] = borderValues[i];
for(; i < borderLength; i++)
constBorderValue[i] = constBorderValue[i-cn];
if( columnBorderType == BORDER_MODE_CONSTANT )
{
// constBorderRow: the row filter applied to a row made only of border pixels;
// it stands in for every row above/below the image
_constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
size_t N = (ssize.width + ksize.width - 1)*cn;
for( i = 0; i < N; i += borderLength )
{
size_t n = std::min( borderLength, N - i );
for(size_t j = 0; j < n; j++)
srcRow[i+j] = constBorderValue[j];
}
MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
}
}
Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
ssize.height + borderMargin.top + borderMargin.bottom);
// dx1/dx2: number of pixels that must be synthesised on the left/right because
// the margin does not cover the kernel's reach
ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
// recompute border tables
if( dx1 > 0 || dx2 > 0 )
{
if( rowBorderType == BORDER_MODE_CONSTANT )
{
// constant border: the border bytes of srcRow never change, write them once
memcpy( srcRow, &constBorderValue[0], dx1*cn );
memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
}
else
{
// other borders: remember where each border byte has to be fetched from,
// relative to the (shifted) row pointer used in the main loop
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
ptrdiff_t wholeWidth = wholeSize.width;
ptrdiff_t i, j;
for( i = 0; i < dx1; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[i*cn + j] = p0 + j;
}
for( i = 0; i < dx2; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[(i + dx1)*cn + j] = p0 + j;
}
}
}
// [startY, endY): rows that have to be row-filtered, expressed in the coordinates
// of the whole image (image plus margin)
ptrdiff_t startY, startY0, endY, rowCount;
startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
u8* dst = dstBase;
ptrdiff_t width = ssize.width, kwidth = ksize.width;
ptrdiff_t kheight = ksize.height, ay = anchorY;
ptrdiff_t width1 = ssize.width + kwidth - 1;
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
// table-driven border fill is only needed for non-constant row borders
bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
ptrdiff_t dy = 0, i = 0;
// let src point at the leftmost margin pixel the kernel can actually read
src -= xofs1*cn;
ptrdiff_t count = endY - startY;
rowCount = 0;
// Each pass: (1) row-filter up to dcount new source rows into the ring buffer,
// (2) collect the ring-buffer rows needed for the next output rows,
// (3) emit i output rows with MorphColumn. dst and dy advance by those i rows.
for(;; dst += dstStride*i, dy += i)
{
ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
dcount = std::min(dcount, count);
count -= dcount;
for( ; dcount-- > 0; src += srcStride )
{
ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
u8* brow = ringBuf + bi*swidthcn;
// once the ring buffer is full the oldest row is overwritten
if( (size_t)(++rowCount) > bufRows )
{
--rowCount;
++startY;
}
memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
if( makeBorder )
{
for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
srcRow[i] = src[borderTab[i]];
for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
}
MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
}
ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
for( i = 0; i < max_i; i++ )
{
ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
wholeSize.height, columnBorderType);
if( srcY < 0 ) // can happen only with constant border type
rows[i] = constBorderRow;
else
{
// stop at the first row that has not been filtered into the ring buffer yet
if( srcY >= startY + rowCount )
break;
ptrdiff_t bi = (srcY - startY0) % bufRows;
rows[i] = ringBuf + bi*swidthcn;
}
}
// fewer than kheight rows available: no further output row can be formed
if( i < kheight )
break;
// i input rows yield i - (kheight - 1) output rows
i -= kheight - 1;
MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
}
}
} // namespace
#endif // CAROTENE_NEON
// Erosion (neighbourhood minimum) with a rectangular ksize kernel.
//
// Requires a non-empty image and an anchor that lies inside the kernel; all
// arguments are forwarded unchanged to morphology<ErodeVecOp>(). Without
// CAROTENE_NEON only the configuration/argument check is performed.
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    const bool validArgs = ssize.width > 0 && ssize.height > 0 &&
                           anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(validArgs);
#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn,
                           srcBase, srcStride,
                           dstBase, dstStride,
                           ksize, anchorX, anchorY,
                           rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
// Dilation (neighbourhood maximum) with a rectangular ksize kernel.
//
// Requires a non-empty image and an anchor that lies inside the kernel; all
// arguments are forwarded unchanged to morphology<DilateVecOp>(). Without
// CAROTENE_NEON only the configuration/argument check is performed.
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    const bool validArgs = ssize.width > 0 && ssize.height > 0 &&
                           anchorX < ksize.width && anchorY < ksize.height;
    internal::assertSupportedConfiguration(validArgs);
#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn,
                            srcBase, srcStride,
                            dstBase, dstStride,
                            ksize, anchorX, anchorY,
                            rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
} // namespace CAROTENE_NS

1572
3rdparty/carotene/src/mul.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1310
3rdparty/carotene/src/norm.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

539
3rdparty/carotene/src/opticalflow.cpp vendored Normal file
View File

@ -0,0 +1,539 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <float.h> // For FLT_EPSILON
namespace CAROTENE_NS {
// Rounding right shift for fixed-point values: adds half of 2^n, i.e.
// (1 << (n-1)), to x and then shifts the sum right by n bits.
// n is expanded twice, so pass a side-effect-free expression (n must be >= 1
// for the inner shift to be valid).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
/*
* Pyramidal Lucas-Kanade Optical Flow level processing
*
* Processes one pyramid level for ptCount points. For every point the function
*  1. scales the previous position by 1/(1 << level) and picks a starting
*     estimate for the new position (see level/maxLevel/useInitialFlow below);
*  2. samples a winSize window around the previous position from prevData and
*     prevDerivData with fixed-point bilinear interpolation, caching the result
*     and accumulating the 2x2 matrix A11/A12/A22 of derivative products;
*  3. rejects the point when the normalised minimum eigenvalue of that matrix
*     is below minEigThreshold or its determinant is below FLT_EPSILON;
*  4. otherwise iterates (at most terminationCount times) updating the estimate
*     from the mismatch between the cached window and nextData;
*  5. optionally writes an error measure.
*
* Parameters (as used by the code below):
*  size              - image extent; window origins outside
*                      [-winSize.width, size.width) x [-winSize.height, size.height)
*                      are rejected.
*  cn                - multiplier applied to x offsets and to the window width
*                      (presumably the number of channels per pixel - confirm).
*  prevData/prevStride, nextData/nextStride
*                    - u8 images; strides are added directly to u8 pointers.
*  prevDerivData/prevDerivStride
*                    - s16 data with two interleaved values per sample (read as
*                      x- and y-derivative, judging by the local names); the
*                      stride is given in bytes and divided by sizeof(s16).
*  ptCount           - number of points.
*  prevPts           - 2*ptCount floats, (x, y) per point, in level-0 scale.
*  nextPts           - in/out, 2*ptCount floats. Read as the initial estimate
*                      (scaled by 1/(1 << level) when level == maxLevel and
*                      useInitialFlow, doubled when level != maxLevel) and
*                      overwritten with the refined position.
*  status            - optional (may be null); only ever set to false here, and
*                      only when level == 0.
*  err               - optional (may be null); receives the minimum eigenvalue
*                      when getMinEigenVals is set, otherwise (level 0 only) the
*                      mean absolute window difference.
*  winSize           - tracking window size.
*  terminationCount  - maximum number of refinement iterations.
*  terminationEpsilon- iteration stops once deltaX^2 + deltaY^2 <= this value.
*  minEigThreshold   - lower bound for the normalised minimum eigenvalue.
*
* Without CAROTENE_NEON the arguments are only marked as used.
*/
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
const u8 *prevData, ptrdiff_t prevStride,
const s16 *prevDerivData, ptrdiff_t prevDerivStride,
const u8 *nextData, ptrdiff_t nextStride,
u32 ptCount,
const f32 *prevPts, f32 *nextPts,
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
f32 minEigThreshold)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// Offset from a point to the top-left corner of its window.
f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
s32 cn2 = cn*2;
// One scratch buffer reused for every point: the first winSize.total()*cn
// entries cache the interpolated intensities, the remaining
// winSize.total()*cn2 entries cache the interleaved derivative pairs.
std::vector<s16> _buf(winSize.total()*(cn + cn2));
s16* IWinBuf = &_buf[0];
s32 IWinBufStride = winSize.width*cn;
s16* derivIWinBuf = &_buf[winSize.total()*cn];
s32 derivIWinBufStride = winSize.width*cn2;
for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
{
// Bring the level-0 coordinates into this level's scale.
f32 levscale = (1./(1 << level));
u32 ptref = ptidx << 1;
f32 prevPtX = prevPts[ptref+0]*levscale;
f32 prevPtY = prevPts[ptref+1]*levscale;
f32 nextPtX;
f32 nextPtY;
// Starting estimate: at the first processed level (level == maxLevel) use
// either the caller's guess or the previous position; at any other level
// double the value currently stored in nextPts.
if( level == maxLevel )
{
if( useInitialFlow )
{
nextPtX = nextPts[ptref+0]*levscale;
nextPtY = nextPts[ptref+1]*levscale;
}
else
{
nextPtX = prevPtX;
nextPtY = prevPtY;
}
}
else
{
nextPtX = nextPts[ptref+0]*2.f;
nextPtY = nextPts[ptref+1]*2.f;
}
nextPts[ptref+0] = nextPtX;
nextPts[ptref+1] = nextPtY;
s32 iprevPtX, iprevPtY;
s32 inextPtX, inextPtY;
// Move to the window's top-left corner and split into integer/fraction.
prevPtX -= halfWinX;
prevPtY -= halfWinY;
iprevPtX = floor(prevPtX);
iprevPtY = floor(prevPtY);
// Reject windows whose origin is outside the accepted range.
if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
continue;
}
// Bilinear interpolation weights in fixed point with W_BITS fractional
// bits; iw11 is derived so that the four weights sum to exactly 1 << W_BITS.
f32 a = prevPtX - iprevPtX;
f32 b = prevPtY - iprevPtY;
const s32 W_BITS = 14, W_BITS1 = 14;
const f32 FLT_SCALE = 1.f/(1 << 20);
s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
// Derivative row stride converted from bytes to s16 elements.
s32 dstep = prevDerivStride/sizeof(s16);
// Scalar accumulators (filled by the tail loop) and vector accumulators
// (filled by the NEON loop); both are combined after the window is scanned.
f32 A11 = 0, A12 = 0, A22 = 0;
int16x4_t viw00 = vmov_n_s16((s16)iw00);
int16x4_t viw01 = vmov_n_s16((s16)iw01);
int16x4_t viw10 = vmov_n_s16((s16)iw10);
int16x4_t viw11 = vmov_n_s16((s16)iw11);
float32x4_t vA11 = vmovq_n_f32(0);
float32x4_t vA12 = vmovq_n_f32(0);
float32x4_t vA22 = vmovq_n_f32(0);
s32 wwcn = winSize.width*cn;
// extract the patch from the first image, compute covariance matrix of derivatives
s32 x = 0;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;
s16* Iptr = IWinBuf + y*IWinBufStride;
s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
// Note: x still holds the value left by the previous row's loops
// (0 on the first row) when this prefetch address is formed.
internal::prefetch(src + x + prevStride * 2, 0);
// Interpolated intensity, 8 samples per iteration. The rounding shift by
// W_BITS1-5 leaves 5 extra fractional bits in the stored s16 values.
for(x = 0; x <= wwcn - 8; x += 8)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
}
// Same computation for a remaining group of 4 samples. Each vld1_u8 still
// loads 8 bytes; only the lower 4 lanes are used.
for(; x <= wwcn - 4; x += 4)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));
int32x4_t vsuml1 = vmull_s16(vs00, viw00);
int32x4_t vsuml2 = vmull_s16(vs01, viw01);
vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
vst1_s16(Iptr + x, vsumnl);
}
internal::prefetch(dsrc + dstep * 2, 0);
// Interpolated derivatives, restarting at x = 0 and handling 4 samples
// (8 interleaved s16 values) per iteration. The results are cached in
// dIptr and their products are accumulated into vA11/vA12/vA22.
for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
{
// NOTE(review): __GNUC_MINOR__ is not expected to be negative, so this
// inline-assembly branch looks permanently compiled out and the intrinsics
// branch below is the one that is built - confirm the guard is intentional.
#if __GNUC_MINOR__ < 0
__asm__ (
"vld2.16 {d0-d1}, [%[dsrc00]] \n\t"
"vld2.16 {d2-d3}, [%[dsrc10]] \n\t"
"vld2.16 {d4-d5}, [%[dsrc01]] \n\t"
"vld2.16 {d6-d7}, [%[dsrc11]] \n\t"
"vmull.s16 q4, d3, %P[viw10] \n\t"
"vmull.s16 q5, d0, %P[viw00] \n\t"
"vmlal.s16 q4, d7, %P[viw11] \n\t"
"vmlal.s16 q5, d4, %P[viw01] \n\t"
"vmlal.s16 q4, d1, %P[viw00] \n\t"
"vmlal.s16 q5, d2, %P[viw10] \n\t"
"vmlal.s16 q4, d5, %P[viw01] \n\t"
"vmlal.s16 q5, d6, %P[viw11] \n\t"
"vrshrn.s32 d13, q4, %[W_BITS1] \n\t"
"vrshrn.s32 d12, q5, %[W_BITS1] \n\t"
"vmull.s16 q3, d13, d13 \n\t"
"vmull.s16 q4, d12, d12 \n\t"
"vmull.s16 q5, d13, d12 \n\t"
"vcvt.f32.s32 q3, q3 \n\t"
"vcvt.f32.s32 q4, q4 \n\t"
"vcvt.f32.s32 q5, q5 \n\t"
"vadd.f32 %q[vA22], q3 \n\t"
"vadd.f32 %q[vA11], q4 \n\t"
"vadd.f32 %q[vA12], q5 \n\t"
"vst2.16 {d12-d13}, [%[out]] \n\t"
: [vA22] "=w" (vA22),
[vA11] "=w" (vA11),
[vA12] "=w" (vA12)
: "0" (vA22),
"1" (vA11),
"2" (vA12),
[out] "r" (dIptr),
[dsrc00] "r" (dsrc),
[dsrc10] "r" (dsrc + dstep),
[dsrc01] "r" (dsrc + cn2),
[dsrc11] "r" (dsrc + dstep + cn2),
[viw00] "w" (viw00),
[viw10] "w" (viw10),
[viw01] "w" (viw01),
[viw11] "w" (viw11),
[W_BITS1] "I" (W_BITS1)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
);
#else
// vld2 de-interleaves the pairs: val[0] holds the first value of every
// pair, val[1] the second.
int16x4x2_t vdsrc00 = vld2_s16(dsrc);
int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);
int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);
vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);
vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);
vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);
int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);
int32x4_t va22i = vmull_s16(vsumny, vsumny);
int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
int32x4_t va12i = vmull_s16(vsumnx, vsumny);
float32x4_t va22f = vcvtq_f32_s32(va22i);
float32x4_t va11f = vcvtq_f32_s32(va11i);
float32x4_t va12f = vcvtq_f32_s32(va12i);
vA22 = vaddq_f32(vA22, va22f);
vA11 = vaddq_f32(vA11, va11f);
vA12 = vaddq_f32(vA12, va12f);
int16x4x2_t vsum;
vsum.val[0] = vsumnx;
vsum.val[1] = vsumny;
vst2_s16(dIptr, vsum);
#endif
}
// Scalar tail: the samples not covered by the 4-wide loops above get both
// their intensity and their derivative pair computed here.
for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
{
s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);
Iptr[x] = (s16)ival;
dIptr[0] = (s16)ixval;
dIptr[1] = (s16)iyval;
A11 += (f32)(ixval*ixval);
A12 += (f32)(ixval*iyval);
A22 += (f32)(iyval*iyval);
}
}
// Fold the four vector lanes into the scalar accumulators, then rescale.
f32 A11buf[2], A12buf[2], A22buf[2];
vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
A11 += A11buf[0] + A11buf[1];
A12 += A12buf[0] + A12buf[1];
A22 += A22buf[0] + A22buf[1];
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
// D is the determinant of [[A11, A12], [A12, A22]]; minEig is the smaller
// eigenvalue of that matrix divided by the window area.
f32 D = A11*A22 - A12*A12;
f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
4.f*A12*A12))/(2*winSize.width*winSize.height);
if( err && getMinEigenVals )
err[ptidx] = (f32)minEig;
// Give up on this point when the matrix is too weak or near-singular.
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
status[ptidx] = false;
continue;
}
D = 1.f/D;
nextPtX -= halfWinX;
nextPtY -= halfWinY;
f32 prevDeltaX = 0;
f32 prevDeltaY = 0;
// Iterative refinement of the window position in the next image.
for(u32 j = 0; j < terminationCount; j++ )
{
inextPtX = floor(nextPtX);
inextPtY = floor(nextPtY);
if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
{
if( level == 0 && status )
status[ptidx] = false;
break;
}
// Recompute the bilinear weights for the current sub-pixel position.
a = nextPtX - inextPtX;
b = nextPtY - inextPtY;
iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = round(a*(1.f - b)*(1 << W_BITS));
iw10 = round((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
// b1/b2 accumulate (J - I) times the first/second cached derivative.
f32 b1 = 0, b2 = 0;
viw00 = vmov_n_s16((s16)iw00);
viw01 = vmov_n_s16((s16)iw01);
viw10 = vmov_n_s16((s16)iw10);
viw11 = vmov_n_s16((s16)iw11);
float32x4_t vb1 = vmovq_n_f32(0);
float32x4_t vb2 = vmovq_n_f32(0);
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
x = 0;
internal::prefetch(Jptr, nextStride * 2);
internal::prefetch(Iptr, IWinBufStride/2);
internal::prefetch(dIptr, derivIWinBufStride/2);
// 8 samples per iteration: interpolate J, subtract the cached I and
// multiply the difference by the cached derivatives.
for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
{
uint8x8_t vj00 = vld1_u8(Jptr + x);
uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
int16x8_t vI = vld1q_s16(Iptr + x);
int16x8x2_t vDerivI = vld2q_s16(dIptr);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);
int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));
float32x4_t vb1f = vcvtq_f32_s32(vb1i);
float32x4_t vb2f = vcvtq_f32_s32(vb2i);
vb1 = vaddq_f32(vb1, vb1f);
vb2 = vaddq_f32(vb2, vb2f);
}
// Scalar tail for the remaining samples of the row.
for( ; x < wwcn; x++, dIptr += 2 )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (f32)(diff*dIptr[0]);
b2 += (f32)(diff*dIptr[1]);
}
}
// Horizontal sums: bbuf[0] is the total of vb1's lanes, bbuf[1] of vb2's.
f32 bbuf[2];
float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
vst1_f32(bbuf, vb);
b1 += bbuf[0];
b2 += bbuf[1];
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
// Solve the 2x2 system for the position update (D already holds 1/det).
f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
f32 deltaY = (f32)((A12*b1 - A11*b2) * D);
nextPtX += deltaX;
nextPtY += deltaY;
nextPts[ptref+0] = nextPtX + halfWinX;
nextPts[ptref+1] = nextPtY + halfWinY;
// Converged: the squared step length is small enough.
if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
break;
// Two consecutive steps almost cancel each other: take back half of the
// last step and stop.
if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
std::abs(deltaY + prevDeltaY) < 0.01 )
{
nextPts[ptref+0] -= deltaX*0.5f;
nextPts[ptref+1] -= deltaY*0.5f;
break;
}
prevDeltaX = deltaX;
prevDeltaY = deltaY;
}
// Error measure, only at level 0 for points still flagged as valid and
// only when the caller did not ask for eigenvalues instead.
if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
{
f32 nextPointX = nextPts[ptref+0] - halfWinX;
f32 nextPointY = nextPts[ptref+1] - halfWinY;
s32 inextPointX = floor(nextPointX);
s32 inextPointY = floor(nextPointY);
if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
{
if( status )
status[ptidx] = false;
continue;
}
f32 aa = nextPointX - inextPointX;
f32 bb = nextPointY - inextPointY;
iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
iw10 = round((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 errval = 0.f;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
for( x = 0; x < wwcn; x++ )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((f32)diff);
}
}
// Mean absolute difference; the factor 32 removes the 5 extra fractional
// bits kept by the W_BITS1-5 descale.
err[ptidx] = errval / (32*wwcn*winSize.height);
}
}
#else
// No NEON support: reference every argument and do nothing else.
(void)size;
(void)cn;
(void)prevData;
(void)prevStride;
(void)prevDerivData;
(void)prevDerivStride;
(void)nextData;
(void)nextStride;
(void)prevPts;
(void)nextPts;
(void)status;
(void)err;
(void)winSize;
(void)terminationCount;
(void)terminationEpsilon;
(void)level;
(void)maxLevel;
(void)useInitialFlow;
(void)getMinEigenVals;
(void)minEigThreshold;
(void)ptCount;
#endif
}
}//CAROTENE_NS

274
3rdparty/carotene/src/phase.cpp vendored Normal file
View File

@ -0,0 +1,274 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cfloat>
#include <cmath>
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
// Declares, in the enclosing scope, the constants used by FASTATAN2SCALAR and
// FASTATAN2VECTOR:
//   P1, P3, P5, P7       - odd-power polynomial coefficients, each multiplied by
//                          (180 / M_PI) and by `scale`;
//   A_90, A_180, A_360   - 90, 180 and 360 multiplied by `scale`;
//   eps, _90, _180, _360, z, p1, p3, p5, p7
//                        - the same values (plus DBL_EPSILON and zero)
//                          broadcast into float32x4_t registers.
// `scale` is substituted without parentheses, so an argument such as
// `256.0f / 360.0f` is folded into the surrounding product left to right.
#define FASTATAN2CONST(scale) \
f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \
P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \
P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \
P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
A_90((f32)(90.f * scale)), \
A_180((f32)(180.f * scale)), \
A_360((f32)(360.f * scale)); \
float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
_90(vdupq_n_f32(A_90)), \
_180(vdupq_n_f32(A_180)), \
_360(vdupq_n_f32(A_360)), \
z(vdupq_n_f32(0.0f)), \
p1(vdupq_n_f32(P1)), \
p3(vdupq_n_f32(P3)), \
p5(vdupq_n_f32(P5)), \
p7(vdupq_n_f32(P7));
// Scalar angle approximation: writes into `a` the scaled angle of (x, y).
// The ratio min(|x|,|y|) / (max(|x|,|y|) + DBL_EPSILON) is fed to the odd
// polynomial P1*c + P3*c^3 + P5*c^5 + P7*c^7; the result is then reflected:
// A_90 - a when |y| > |x|, A_180 - a when x < 0, A_360 - a when y < 0.
// Requires the names declared by FASTATAN2CONST to be in scope. `x` and `y`
// are expanded several times, so pass side-effect-free expressions.
#define FASTATAN2SCALAR(y, x, a) \
{ \
f32 ax = std::abs(x), ay = std::abs(y); \
f32 c, c2; \
if (ax >= ay) \
{ \
c = ay / (ax + (float)DBL_EPSILON); \
c2 = c * c; \
a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
else \
{ \
c = ax / (ay + (float)DBL_EPSILON); \
c2 = c * c; \
a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
if (x < 0) \
a = A_180 - a; \
if (y < 0) \
a = A_360 - a; \
}
// Four-lane counterpart of FASTATAN2SCALAR operating on float32x4_t values.
// The division is replaced by a multiplication with internal::vrecpq_f32 of
// (max + eps), and the three reflections are applied with lane-wise selects.
// Requires the vector constants declared by FASTATAN2CONST to be in scope.
#define FASTATAN2VECTOR(v_y, v_x, a) \
{ \
float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
float32x4_t c2 = vmulq_f32(c, c); \
a = vmulq_f32(c2, p7); \
\
a = vmulq_f32(vaddq_f32(a, p5), c2); \
a = vmulq_f32(vaddq_f32(a, p3), c2); \
a = vmulq_f32(vaddq_f32(a, p1), c); \
\
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
\
}
} // namespace
#endif
/*
 * Per-element angle of the vector (src0, src1) for s16 inputs, written as u8.
 *
 * The angle constants are instantiated with a scale of 256/360, and each
 * result is rounded by adding 0.5 before the conversion to an integer and the
 * narrowing to 8 bits. Rows are processed 16 elements at a time, then 8 at a
 * time, then element by element.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(256.0f / 360.0f)

    // Exclusive upper bounds for the 16-wide and 8-wide loops.
    size_t wideLimit = size.width >= 15 ? size.width - 15 : 0;
    size_t narrowLimit = size.width >= 7 ? size.width - 7 : 0;

    float32x4_t vHalf = vdupq_n_f32(0.5f);

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
        const s16 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);

        size_t col = 0;

        // 16 elements per iteration.
        while (col < wideLimit)
        {
            internal::prefetch(xRow + col);
            internal::prefetch(yRow + col);

            int16x8_t vxFirst = vld1q_s16(xRow + col);
            int16x8_t vxSecond = vld1q_s16(xRow + col + 8);
            int16x8_t vyFirst = vld1q_s16(yRow + col);
            int16x8_t vySecond = vld1q_s16(yRow + col + 8);

            float32x4_t fx, fy;
            float32x4_t angLo, angHi;

            // elements 0..7
            fx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxFirst)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vyFirst)));
            FASTATAN2VECTOR(fy, fx, angLo)

            fx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxFirst)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vyFirst)));
            FASTATAN2VECTOR(fy, fx, angHi)

            uint16x4_t q0 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angLo, vHalf)));
            uint16x4_t q1 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angHi, vHalf)));
            uint16x8_t firstHalf = vcombine_u16(q0, q1);

            // elements 8..15
            fx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxSecond)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vySecond)));
            FASTATAN2VECTOR(fy, fx, angLo)

            fx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxSecond)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vySecond)));
            FASTATAN2VECTOR(fy, fx, angHi)

            uint16x4_t q2 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angLo, vHalf)));
            uint16x4_t q3 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angHi, vHalf)));
            uint16x8_t secondHalf = vcombine_u16(q2, q3);

            vst1q_u8(outRow + col, vcombine_u8(vmovn_u16(firstHalf),
                                               vmovn_u16(secondHalf)));
            col += 16;
        }

        // 8 elements per iteration.
        while (col < narrowLimit)
        {
            int16x8_t vx = vld1q_s16(xRow + col);
            int16x8_t vy = vld1q_s16(yRow + col);

            float32x4_t fx, fy;
            float32x4_t angLo, angHi;

            fx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vx)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vy)));
            FASTATAN2VECTOR(fy, fx, angLo)

            fx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vx)));
            fy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vy)));
            FASTATAN2VECTOR(fy, fx, angHi)

            uint16x4_t q0 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angLo, vHalf)));
            uint16x4_t q1 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(angHi, vHalf)));

            vst1_u8(outRow + col, vmovn_u16(vcombine_u16(q0, q1)));
            col += 8;
        }

        // Remaining elements, one at a time.
        for (; col < size.width; col++)
        {
            f32 re = xRow[col];
            f32 im = yRow[col];
            f32 angle;
            FASTATAN2SCALAR(im, re, angle)
            outRow[col] = (u8)(s32)floor(angle + 0.5f);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
/*
 * Per-element angle of the vector (src0, src1) for f32 inputs, written as f32.
 *
 * The angle constants are instantiated with the caller's `scale`. Rows are
 * processed 8 elements at a time, then at most one group of 4, then element
 * by element.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void phase(const Size2D &size,
           const f32 * src0Base, ptrdiff_t src0Stride,
           const f32 * src1Base, ptrdiff_t src1Stride,
           f32 * dstBase, ptrdiff_t dstStride,
           f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(scale)

    // Exclusive upper bound for the 8-wide loop.
    size_t wideLimit = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
        f32 * outRow = internal::getRowPtr(dstBase, dstStride, row);

        size_t col = 0;

        // 8 elements per iteration, handled as two groups of 4.
        while (col < wideLimit)
        {
            internal::prefetch(xRow + col);
            internal::prefetch(yRow + col);

            float32x4_t vxA = vld1q_f32(xRow + col);
            float32x4_t vxB = vld1q_f32(xRow + col + 4);
            float32x4_t vyA = vld1q_f32(yRow + col);
            float32x4_t vyB = vld1q_f32(yRow + col + 4);

            float32x4_t vAngle;

            FASTATAN2VECTOR(vyA, vxA, vAngle)
            vst1q_f32(outRow + col, vAngle);

            FASTATAN2VECTOR(vyB, vxB, vAngle)
            vst1q_f32(outRow + col + 4, vAngle);

            col += 8;
        }

        // One more group of 4 if it fits.
        if (col + 4 <= size.width)
        {
            float32x4_t vx = vld1q_f32(xRow + col);
            float32x4_t vy = vld1q_f32(yRow + col);
            float32x4_t vAngle;
            FASTATAN2VECTOR(vy, vx, vAngle)
            vst1q_f32(outRow + col, vAngle);
            col += 4;
        }

        // Remaining elements, one at a time.
        for (; col < size.width; col++)
        {
            f32 angle;
            FASTATAN2SCALAR(yRow[col], xRow[col], angle)
            outRow[col] = angle;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
} // namespace CAROTENE_NS

1414
3rdparty/carotene/src/pyramid.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

460
3rdparty/carotene/src/reduce.cpp vendored Normal file
View File

@ -0,0 +1,460 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
/*
 * Column sums of a u8 image: dstBase[c] receives the sum of column c over all
 * size.height rows, as s32.
 *
 * Columns are handled 16 at a time with NEON. Inside a strip, rows are summed
 * in batches of at most 256 into 16-bit lanes (256 * 255 still fits in u16),
 * and each batch is then added to the 32-bit totals with a saturating add.
 * The remaining columns are summed one value at a time and clamped to
 * 0x7fFFffFF.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void reduceColSum(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  s32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memset(dstBase, 0, size.width*sizeof(s32));

    size_t col = 0;
    while (col + 16 <= size.width)
    {
        const u8* rowPtr = srcBase + col;

        // 32-bit totals for columns col+0..3, +4..7, +8..11, +12..15.
        int32x4_t total0 = vmovq_n_s32(0);
        int32x4_t total1 = vmovq_n_s32(0);
        int32x4_t total2 = vmovq_n_s32(0);
        int32x4_t total3 = vmovq_n_s32(0);

        for (size_t batchBegin = 0; batchBegin < size.height; batchBegin += 256)
        {
            size_t batchEnd = std::min(batchBegin + 256, size.height);

            uint16x8_t partLow = vmovq_n_u16(0);
            uint16x8_t partHigh = vmovq_n_u16(0);

            for (size_t row = batchBegin; row < batchEnd; ++row)
            {
                internal::prefetch(rowPtr + srcStride, 0);
                uint8x16_t pixels = vld1q_u8(rowPtr);
                partLow = vaddw_u8(partLow, vget_low_u8(pixels));
                partHigh = vaddw_u8(partHigh, vget_high_u8(pixels));
                rowPtr += srcStride;
            }

            // Widen the 16-bit batch sums and fold them into the totals.
            total0 = vqaddq_s32(total0, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(partLow))));
            total1 = vqaddq_s32(total1, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(partLow))));
            total2 = vqaddq_s32(total2, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(partHigh))));
            total3 = vqaddq_s32(total3, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(partHigh))));
        }

        s32 * out = dstBase + col;
        vst1q_s32(out + 0, total0);
        vst1q_s32(out + 4, total1);
        vst1q_s32(out + 8, total2);
        vst1q_s32(out + 12, total3);

        col += 16;
    }

    // Columns that did not fill a 16-wide strip.
    for (size_t row = 0; row < size.height; ++row)
    {
        for (size_t c = col; c < size.width; c++)
        {
            s32 &sum = dstBase[c];
            sum += srcBase[c + srcStride * row];
            if ((u32)sum > 0x7fFFffFFu)
                sum = 0x7fFFffFF;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column maxima of a u8 image: dstBase[c] receives the largest value found in
 * column c.
 *
 * The first row is copied to dstBase as the starting value. Columns are then
 * processed in strips of 64 and 16 with NEON (rows 1..height-1 are folded in
 * with vmaxq_u8), and the remaining columns are finished with scalar code.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void reduceColMax(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width);

    size_t col = 0;

    // Strips of 64 columns.
    while (col + 16*4 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t best0 = vld1q_u8(rowPtr + 0);
        uint8x16_t best1 = vld1q_u8(rowPtr + 16);
        uint8x16_t best2 = vld1q_u8(rowPtr + 32);
        uint8x16_t best3 = vld1q_u8(rowPtr + 48);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            internal::prefetch(rowPtr + srcStride, 32);

            uint8x16_t cur0 = vld1q_u8(rowPtr + 0);
            uint8x16_t cur1 = vld1q_u8(rowPtr + 16);
            uint8x16_t cur2 = vld1q_u8(rowPtr + 32);
            uint8x16_t cur3 = vld1q_u8(rowPtr + 48);

            best0 = vmaxq_u8(best0, cur0);
            best1 = vmaxq_u8(best1, cur1);
            best2 = vmaxq_u8(best2, cur2);
            best3 = vmaxq_u8(best3, cur3);

            rowPtr += srcStride;
        }

        u8 * out = dstBase + col;
        vst1q_u8(out + 0, best0);
        vst1q_u8(out + 16, best1);
        vst1q_u8(out + 32, best2);
        vst1q_u8(out + 48, best3);

        col += 16*4;
    }

    // Strips of 16 columns.
    while (col + 16 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t best = vld1q_u8(rowPtr);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            uint8x16_t cur = vld1q_u8(rowPtr);
            best = vmaxq_u8(best, cur);
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col, best);
        col += 16;
    }

    // Leftover columns: dstBase already holds row 0 for them.
    if (col < size.width)
    {
        for (size_t row = 1; row < size.height; ++row)
        {
            for (size_t c = col; c < size.width; c++)
            {
                const u8 candidate = srcBase[c + srcStride * row];
                if (dstBase[c] < candidate)
                    dstBase[c] = candidate;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column minima of a u8 image: dstBase[c] receives the smallest value found in
 * column c.
 *
 * The first row is copied to dstBase as the starting value. Columns are then
 * processed in strips of 64 and 16 with NEON (rows 1..height-1 are folded in
 * with vminq_u8), and the remaining columns are finished with scalar code.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void reduceColMin(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width);

    size_t col = 0;

    // Strips of 64 columns.
    while (col + 16*4 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t low0 = vld1q_u8(rowPtr + 0);
        uint8x16_t low1 = vld1q_u8(rowPtr + 16);
        uint8x16_t low2 = vld1q_u8(rowPtr + 32);
        uint8x16_t low3 = vld1q_u8(rowPtr + 48);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            internal::prefetch(rowPtr + srcStride, 32);

            uint8x16_t cur0 = vld1q_u8(rowPtr + 0);
            uint8x16_t cur1 = vld1q_u8(rowPtr + 16);
            uint8x16_t cur2 = vld1q_u8(rowPtr + 32);
            uint8x16_t cur3 = vld1q_u8(rowPtr + 48);

            low0 = vminq_u8(low0, cur0);
            low1 = vminq_u8(low1, cur1);
            low2 = vminq_u8(low2, cur2);
            low3 = vminq_u8(low3, cur3);

            rowPtr += srcStride;
        }

        u8 * out = dstBase + col;
        vst1q_u8(out + 0, low0);
        vst1q_u8(out + 16, low1);
        vst1q_u8(out + 32, low2);
        vst1q_u8(out + 48, low3);

        col += 16*4;
    }

    // Strips of 16 columns.
    while (col + 16 <= size.width)
    {
        const u8* rowPtr = srcBase + col;
        uint8x16_t low = vld1q_u8(rowPtr);
        rowPtr += srcStride;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcStride, 0);
            uint8x16_t cur = vld1q_u8(rowPtr);
            low = vminq_u8(low, cur);
            rowPtr += srcStride;
        }

        vst1q_u8(dstBase + col, low);
        col += 16;
    }

    // Leftover columns: dstBase already holds row 0 for them.
    if (col < size.width)
    {
        for (size_t row = 1; row < size.height; ++row)
        {
            for (size_t c = col; c < size.width; c++)
            {
                const u8 candidate = srcBase[c + srcStride * row];
                if (candidate < dstBase[c])
                    dstBase[c] = candidate;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
/*
 * Column sums of an f32 image: dstBase[c] receives the sum of column c over
 * all size.height rows.
 *
 * The first row is copied to dstBase as the starting value. srcStride is
 * given in bytes and converted to an element step. Columns are processed in
 * strips of 16 and 4 with NEON (rows 1..height-1 are added with vaddq_f32),
 * and the remaining columns are accumulated with scalar code.
 *
 * Without CAROTENE_NEON the arguments are only marked as used.
 */
void reduceColSum(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row step expressed in f32 elements.
    size_t rowStep = srcStride/sizeof(f32);

    size_t col = 0;

    // Strips of 16 columns (four vectors).
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t acc0 = vld1q_f32(rowPtr + 0);
        float32x4_t acc1 = vld1q_f32(rowPtr + 4);
        float32x4_t acc2 = vld1q_f32(rowPtr + 8);
        float32x4_t acc3 = vld1q_f32(rowPtr + 12);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            internal::prefetch(rowPtr + rowStep, 32);

            float32x4_t cur0 = vld1q_f32(rowPtr + 0);
            float32x4_t cur1 = vld1q_f32(rowPtr + 4);
            float32x4_t cur2 = vld1q_f32(rowPtr + 8);
            float32x4_t cur3 = vld1q_f32(rowPtr + 12);

            acc0 = vaddq_f32(acc0, cur0);
            acc1 = vaddq_f32(acc1, cur1);
            acc2 = vaddq_f32(acc2, cur2);
            acc3 = vaddq_f32(acc3, cur3);

            rowPtr += rowStep;
        }

        f32 * out = dstBase + col;
        vst1q_f32(out + 0, acc0);
        vst1q_f32(out + 4, acc1);
        vst1q_f32(out + 8, acc2);
        vst1q_f32(out + 12, acc3);

        col += 16;
    }

    // Strips of 4 columns (one vector).
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t acc = vld1q_f32(rowPtr);
        rowPtr += rowStep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + rowStep, 0);
            float32x4_t cur = vld1q_f32(rowPtr);
            acc = vaddq_f32(acc, cur);
            rowPtr += rowStep;
        }

        vst1q_f32(dstBase + col, acc);
        col += 4;
    }

    // Leftover columns: dstBase already holds row 0 for them.
    if (col < size.width)
    {
        for (size_t row = 1; row < size.height; ++row)
        {
            for (size_t c = col; c < size.width; c++)
            {
                dstBase[c] += srcBase[c + rowStep * row];
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
// Column-wise maximum of an f32 image: on return dstBase[x] holds the largest
// value found in column x over all size.height rows.
//   size      - width/height of the source image, in pixels
//   srcBase   - first source row; consecutive rows are srcStride bytes apart
//   dstBase   - output row of size.width floats
// Without CAROTENE_NEON nothing is computed after the configuration check.
void reduceColMax(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the destination with row 0; the scalar tail refines it in place.
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row pitch converted from bytes to f32 elements.
    size_t srcstep = srcStride/sizeof(f32);

    size_t col = 0;

    // Wide path: 16 columns per pass, one q-register running maximum per 4 columns.
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t best0 = vld1q_f32(rowPtr + 0);
        float32x4_t best1 = vld1q_f32(rowPtr + 4);
        float32x4_t best2 = vld1q_f32(rowPtr + 8);
        float32x4_t best3 = vld1q_f32(rowPtr + 12);
        rowPtr += srcstep;

        for (size_t row = 1; row < size.height; ++row)
        {
            // Prefetch is issued for the address one row ahead of the one being read.
            internal::prefetch(rowPtr + srcstep, 0);
            internal::prefetch(rowPtr + srcstep, 32);

            best0 = vmaxq_f32(best0, vld1q_f32(rowPtr + 0));
            best1 = vmaxq_f32(best1, vld1q_f32(rowPtr + 4));
            best2 = vmaxq_f32(best2, vld1q_f32(rowPtr + 8));
            best3 = vmaxq_f32(best3, vld1q_f32(rowPtr + 12));

            rowPtr += srcstep;
        }

        f32* out = dstBase + col;
        vst1q_f32(out + 0, best0);
        vst1q_f32(out + 4, best1);
        vst1q_f32(out + 8, best2);
        vst1q_f32(out + 12, best3);

        col += 16;
    }

    // Narrow path: 4 columns per pass.
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t best = vld1q_f32(rowPtr);
        rowPtr += srcstep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcstep, 0);
            best = vmaxq_f32(best, vld1q_f32(rowPtr));
            rowPtr += srcstep;
        }

        vst1q_f32(dstBase + col, best);
        col += 4;
    }

    // Scalar tail for the last (fewer than 4) columns; dstBase already holds row 0 there.
    if (col < size.width)
    {
        for (size_t row = 1; row < size.height; ++row)
        {
            const size_t rowOffset = srcstep * row;
            for (size_t c = col; c < size.width; ++c)
            {
                dstBase[c] = std::max(dstBase[c], srcBase[c + rowOffset]);
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
// Column-wise minimum of an f32 image: on return dstBase[x] holds the smallest
// value found in column x over all size.height rows.
//   size      - width/height of the source image, in pixels
//   srcBase   - first source row; consecutive rows are srcStride bytes apart
//   dstBase   - output row of size.width floats
// Without CAROTENE_NEON nothing is computed after the configuration check.
void reduceColMin(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the destination with row 0; the scalar tail refines it in place.
    memcpy(dstBase, srcBase, size.width*sizeof(f32));

    // Row pitch converted from bytes to f32 elements.
    size_t srcstep = srcStride/sizeof(f32);

    size_t col = 0;

    // Wide path: 16 columns per pass, one q-register running minimum per 4 columns.
    while (col + 16 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t low0 = vld1q_f32(rowPtr + 0);
        float32x4_t low1 = vld1q_f32(rowPtr + 4);
        float32x4_t low2 = vld1q_f32(rowPtr + 8);
        float32x4_t low3 = vld1q_f32(rowPtr + 12);
        rowPtr += srcstep;

        for (size_t row = 1; row < size.height; ++row)
        {
            // Prefetch is issued for the address one row ahead of the one being read.
            internal::prefetch(rowPtr + srcstep, 0);
            internal::prefetch(rowPtr + srcstep, 32);

            low0 = vminq_f32(low0, vld1q_f32(rowPtr + 0));
            low1 = vminq_f32(low1, vld1q_f32(rowPtr + 4));
            low2 = vminq_f32(low2, vld1q_f32(rowPtr + 8));
            low3 = vminq_f32(low3, vld1q_f32(rowPtr + 12));

            rowPtr += srcstep;
        }

        f32* out = dstBase + col;
        vst1q_f32(out + 0, low0);
        vst1q_f32(out + 4, low1);
        vst1q_f32(out + 8, low2);
        vst1q_f32(out + 12, low3);

        col += 16;
    }

    // Narrow path: 4 columns per pass.
    while (col + 4 <= size.width)
    {
        const f32* rowPtr = srcBase + col;
        float32x4_t low = vld1q_f32(rowPtr);
        rowPtr += srcstep;

        for (size_t row = 1; row < size.height; ++row)
        {
            internal::prefetch(rowPtr + srcstep, 0);
            low = vminq_f32(low, vld1q_f32(rowPtr));
            rowPtr += srcstep;
        }

        vst1q_f32(dstBase + col, low);
        col += 4;
    }

    // Scalar tail for the last (fewer than 4) columns; dstBase already holds row 0 there.
    if (col < size.width)
    {
        for (size_t row = 1; row < size.height; ++row)
        {
            const size_t rowOffset = srcstep * row;
            for (size_t c = col; c < size.width; ++c)
            {
                dstBase[c] = std::min(dstBase[c], srcBase[c + rowOffset]);
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
} // namespace CAROTENE_NS

694
3rdparty/carotene/src/remap.cpp vendored Normal file
View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace internal {
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
dst_row[x] = srcBase[map_row[x]];
}
}
}
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
s32 src_idx = map_row[x];
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
}
}
}
// Bilinear gather kernel for one tile in which every map entry is used as-is.
// size - tile width/height in pixels
// srcBase - source image origin; map entries are offsets into it
// map - 4 offsets per output pixel, read as map_row[4*x + 0..3]; rows are densely packed
// coeffs - 2 weights per output pixel, read as coeff_row[2*x + 0..1]; rows are densely packed
// dstBase - top-left destination pixel of the tile, rows dstStride bytes apart
// Per pixel: a = s0 + (s1 - s0) * w0, b = s2 + (s3 - s2) * w0, result = a + (b - a) * w1,
// where s0..s3 are the source bytes at the 4 offsets and w0, w1 are the 2 weights.
// No offset is range-checked here, so all of them must be valid for srcBase.
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
// Vector path: 8 output pixels per iteration.
// NOTE(review): the bound is strict (x + 8 < size.width), so when exactly 8 pixels remain
// they are handled by the scalar loop below rather than here - confirm this is intended.
for ( ; x + 8 < size.width; x += 8)
{
// Gather the source byte at offset 0 of each of the 8 pixels (map stride is 4 per pixel).
int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
// Same gather for offset 1 of each pixel.
int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
// Same gather for offset 2 of each pixel.
int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
// Same gather for offset 3 of each pixel.
int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
// first part
// Pixels x..x+3: vld2q_f32 splits the interleaved weights into val[0] (w0) and val[1] (w1).
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
// Pixels x+4..x+7: same blend on the high halves, weights start 8 floats further on.
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
// Narrow the two 4-pixel halves to 8 bytes and write them in one store.
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
// Scalar path for the remaining pixels; same formula, result passed through floorf.
for ( ; x < size.width; ++x)
{
s32 src00_index = map_row[(x << 2)];
s32 src10_index = map_row[(x << 2) + 2];
f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
srcBase[src00_index];
f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
srcBase[src10_index];
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
// Bilinear gather kernel for one tile with a constant border value.
// size - tile width/height in pixels
// srcBase - source image origin; non-negative map entries are offsets into it
// map - 4 offsets per output pixel, read as map_row[4*x + 0..3]; rows are densely packed.
// A negative entry means "use borderValue instead of a source byte".
// coeffs - 2 weights per output pixel, read as coeff_row[2*x + 0..1]; rows are densely packed
// dstBase - top-left destination pixel of the tile, rows dstStride bytes apart
// borderValue - sample value substituted for negative map entries
// Per pixel: a = s0 + (s1 - s0) * w0, b = s2 + (s3 - s2) * w0, result = a + (b - a) * w1,
// where s0..s3 are the 4 samples (source byte or borderValue) and w0, w1 are the 2 weights.
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
// Vector path: 8 output pixels per iteration.
// NOTE(review): the bound is strict (x + 8 < size.width), so when exactly 8 pixels remain
// they are handled by the scalar loop below rather than here - confirm this is intended.
for ( ; x + 8 < size.width; x += 8)
{
// Sample 0 of each of the 8 pixels: source byte if the offset is >= 0, else borderValue.
int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
// Sample 1 of each pixel.
int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
// Sample 2 of each pixel.
int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
// Sample 3 of each pixel.
int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
// first part
// Pixels x..x+3: vld2q_f32 splits the interleaved weights into val[0] (w0) and val[1] (w1).
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
// Pixels x+4..x+7: same blend on the high halves, weights start 8 floats further on.
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
// Narrow the two 4-pixel halves to 8 bytes and write them in one store.
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
// Scalar path for the remaining pixels; same formula, result passed through floorf.
for ( ; x < size.width; ++x)
{
s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
} // namespace internal
#endif // CAROTENE_NEON
// Tells whether remapNearestNeighbor can handle a source image of the given size.
// Where size_t is wider than 32 bits, a width or height above 0xffffFFFF is rejected
// because source offsets are evaluated in 32-bit arithmetic; in every other case the
// answer is whatever isSupportedConfiguration() reports.
bool isRemapNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    const bool tooLarge = ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF;
    if (tooLarge)
        return false;
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
// Tells whether remapLinear can handle a source image of the given size.
// Where size_t is wider than 32 bits, a width or height above 0xffffFFFF is rejected
// because source offsets are evaluated in 32-bit arithmetic; in every other case the
// answer is whatever isSupportedConfiguration() reports.
bool isRemapLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    const bool tooLarge = ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF;
    if (tooLarge)
        return false;
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
// Nearest-neighbour remap of a u8 image.
// ssize - source image size, used to clamp or validate the looked-up coordinates
// dsize - destination image size; also the extent of the lookup table
// srcBase - source image; a sample is read at srcBase[src_y * srcStride + src_x]
// tableBase - per destination pixel an interleaved (x, y) pair of f32 source coordinates,
// rows tableStride bytes apart
// dstBase - destination image, rows dstStride bytes apart
// borderMode - BORDER_MODE_REPLICATE clamps coordinates into the source image;
// BORDER_MODE_CONSTANT writes borderValue for coordinates outside it.
// Any other mode falls through both branches and leaves dst untouched.
// The destination is processed in BLOCK_SIZE x BLOCK_SIZE tiles: first a tile of source
// offsets is computed into a stack buffer, then the matching internal kernel gathers the bytes.
// Without CAROTENE_NEON nothing is computed after the configuration check.
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// One offset per tile pixel, plus slack for alignPtr (presumably rounds up to 16 bytes - confirm).
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
// Largest valid x / y coordinate, broadcast for the 4-lane and 2-lane paths.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
// NOTE(review): srcStride (ptrdiff_t) is narrowed to 32 bits here; offsets are s32 throughout.
int32x4_t v_step4 = vdupq_n_s32(srcStride);
int32x2_t v_step2 = vdup_n_s32(srcStride);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
int32x2_t v_zero2 = vdup_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Two floats per destination pixel, hence the (j << 1) column offset.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
// 8 pixels per pass: convert to int, clamp to [0, width-1] x [0, height-1], offset = x + y * stride.
// NOTE(review): vcvtq_s32_f32 is used here while the scalar tail uses floorf; the clamp
// to 0 appears to make both agree for negative inputs - confirm.
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
// 4 pixels per pass.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
}
// 2 pixels per pass.
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
vst1_s32(map_row + x, v_dst_index);
}
// Scalar remainder.
for ( ; x < blockWidth; ++x)
{
s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// Offset -1 marks "outside the source"; remapNearestNeighborConst writes borderValue for it.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
int32x2_t v_m1_2 = vdup_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
float32x2_t v_zero2 = vdup_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Two floats per destination pixel, hence the (j << 1) column offset.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
// 8 pixels per pass. A pixel is inside when the float coordinate is >= 0 (tested before
// conversion) and the converted integer is <= width-1 / height-1; otherwise offset = -1.
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
// 4 pixels per pass.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
}
// 2 pixels per pass.
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
vst1_s32(map_row + x, v_dst_index);
}
// Scalar remainder: floor the coordinates and range-check them explicitly.
for ( ; x < blockWidth; ++x)
{
s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
// Bilinear remap of a u8 image.
// ssize - source image size, used to clamp or validate the looked-up coordinates
// dsize - destination image size; also the extent of the lookup table
// srcBase - source image; a sample is read at srcBase[src_y * srcStride + src_x]
// tableBase - per destination pixel an interleaved (x, y) pair of f32 source coordinates,
// rows tableStride bytes apart
// dstBase - destination image, rows dstStride bytes apart
// borderMode - BORDER_MODE_REPLICATE clamps the 4 neighbour coordinates into the source image;
// BORDER_MODE_CONSTANT substitutes borderValue for neighbours outside it.
// Any other mode falls through both branches and leaves dst untouched.
// The destination is processed in BLOCK_SIZE x BLOCK_SIZE tiles. For each tile pixel this
// function stores 4 source offsets in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1) and the
// 2 fractional parts (x, y) of the coordinate; the internal kernel then blends the samples.
// Without CAROTENE_NEON nothing is computed after the configuration check.
void remapLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// 4 offsets and 2 weights per tile pixel, plus slack for alignPtr
// (presumably rounds up to 16 bytes - confirm).
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
// Largest valid x / y coordinate, broadcast to 4 lanes.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
// NOTE(review): srcStride (ptrdiff_t) is narrowed to 32 bits here; offsets are s32 throughout.
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Two floats per destination pixel, hence the (j << 1) column offset.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
// 4 pixels per pass.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
// Fraction = coordinate - converted integer. Where it comes out negative, add 1 to the
// fraction and subtract 1 from the integer, so the pair matches the floorf-based scalar tail.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
// Clamp both the base coordinate and its +1 neighbour into the source image.
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
// Offsets in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1); vst4q interleaves them per pixel.
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
// Scalar remainder: fractions are taken before the coordinates are clamped.
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[x << 1] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
// Blend the tile from the offsets and weights just computed.
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// Offset -1 marks a neighbour outside the source; remapLinearConst uses borderValue for it.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Two floats per destination pixel, hence the (j << 1) column offset.
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
// 4 pixels per pass.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
// Fraction = coordinate - converted integer. Where it comes out negative, add 1 to the
// fraction and subtract 1 from the integer, so the pair matches the floorf-based scalar tail.
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
// Unclamped offsets in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1).
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
// Validity per axis: lower bound tested on the float coordinate (coordinate + 1 for the
// second neighbour), upper bound tested on the integer against width-1 / height-1.
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
// Keep an offset only when both of its axes are valid; otherwise store -1.
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
// Scalar remainder: floor the coordinates and range-check each of the 4 neighbours.
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1)] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
// Blend the tile from the offsets and weights just computed.
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

85
3rdparty/carotene/src/remap.hpp vendored Normal file
View File

@ -0,0 +1,85 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_REMAP_HPP
#define CAROTENE_SRC_REMAP_HPP
#include "common.hpp"
#include <cmath>
#ifdef CAROTENE_NEON
// Internal remap kernels shared between translation units.
// Each kernel processes one tile of `size` pixels using a precomputed
// per-pixel lookup table (`map`) instead of raw floating point coordinates.
namespace CAROTENE_NS { namespace internal {
// NOTE(review): presumably the tile edge length (in pixels) used by the
// callers when they split the destination image into blocks — confirm
// against remap.cpp.
enum
{
BLOCK_SIZE = 32
};
// Nearest-neighbour remap of one tile.
// NOTE(review): the "Replicate" suffix suggests the caller has already clamped
// out-of-range coordinates so every `map` entry is a valid source offset —
// confirm against the implementation.
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride);
// Nearest-neighbour remap of one tile with a constant border.
// NOTE(review): presumably `borderValue` is written wherever the `map` entry
// marks the source pixel as out of range — confirm against the implementation.
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
// Bilinear remap of one tile.
// NOTE(review): `map`/`coeffs` are assumed to use the same layout as for
// remapLinearConst (four offsets and two weights per pixel) — confirm.
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride);
// Bilinear remap of one tile with a constant border.
// As filled in by the caller in remap.cpp, `map` carries four s32 entries per
// destination pixel (the offsets `y * srcStride + x` of the four neighbouring
// source pixels, or -1 when a neighbour lies outside the source image) and
// `coeffs` carries two f32 entries per pixel (the fractional x and y parts of
// the source coordinate).
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
} }
#endif // CAROTENE_NEON
#endif // CAROTENE_SRC_REMAP_HPP

2191
3rdparty/carotene/src/resize.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

199
3rdparty/carotene/src/saturate_cast.hpp vendored Normal file
View File

@ -0,0 +1,199 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SATURATE_CAST_HPP
#define CAROTENE_SATURATE_CAST_HPP
#include <algorithm>
#include <climits>
#include <cmath>
#if defined _MSC_VER && defined _M_ARM
# include <intrin.h>
#endif
#include <carotene/definitions.hpp>
#include <carotene/types.hpp>
namespace CAROTENE_NS { namespace internal {
// Platform specific float -> s32 rounding helpers.
// When available, CAROTENE_ROUND_FLT / CAROTENE_ROUND_DBL expand to a complete
// `return ...;` statement (or block) and are used as the whole body of round().
// If neither macro is defined, round() falls back to portable C++ code.
#if defined _MSC_VER && defined _M_ARM
// Hand-encoded Thumb-2 routine: converts the f64 argument to s32 with the
// VFP "vcvtr" instruction and returns it in r0. Declared `void` because the
// function is naked; callers cast the pointer to `s32 (*)(f64)`.
__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
{
    (void)d;
    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
    __emit(0x0B40);
    __emit(0xEE10); // vmov r0, s0
    __emit(0x0A10);
    __emit(0x4770); // bx lr
}
// The macro argument is parenthesized so that the cast applies to the whole
// expression, not just its first operand.
# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)(x));
# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
#elif defined CV_ICC || defined __GNUC__
# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
// Hardware VFP conversion. The result is produced in a VFP register and
// reinterpreted as an integer through the union.
// NOTE(review): ftosis/ftosid are assumed to honour the current FPSCR rounding
// mode — confirm against the ARM VFP documentation.
// The `register` storage class was dropped: it is deprecated since C++11 and
// ill-formed in C++17, and has no effect on the generated code.
#  define CAROTENE_ROUND_FLT(value) { \
    union { f32 f; s32 i; } result; \
    asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
    return result.i; }
#  define CAROTENE_ROUND_DBL(value) { \
    union {f32 f; s32 i;} __tegra_result; \
    asm ( \
        "ftosid %0, %P1\n" \
        : "=w" (__tegra_result.f) \
        : "w" (value) \
    ); \
    return __tegra_result.i; \
    }
# else
// Generic GCC-compatible path: round using the current rounding mode.
// The parameter is now actually used by the expansion (it used to be named
// `x` while the body referred to `value`, silently capturing the caller's
// variable).
#  define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
#  define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
# endif
#endif
// Rounds a single precision value to the nearest s32.
// Uses the platform specific CAROTENE_ROUND_FLT statement when one is defined;
// otherwise falls back to a portable round-half-to-even implementation.
inline s32 round(f32 value)
{
#ifdef CAROTENE_ROUND_FLT
    CAROTENE_ROUND_FLT(value)
#else
    const s32 truncated = (s32)(value);
    const f32 fraction = value - truncated;
    // Exactly halfway between two integers and the truncated one is even:
    // keep it (ties go to the even neighbour).
    const bool halfway = (fraction == 0.5 || fraction == -0.5);
    if (halfway && (truncated % 2) == 0)
        return truncated;
    // Everything else: round half away from zero.
    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
// Rounds a double precision value to the nearest s32.
// Uses the platform specific CAROTENE_ROUND_DBL statement when one is defined;
// otherwise falls back to a portable round-half-to-even implementation.
inline s32 round(f64 value)
{
#ifdef CAROTENE_ROUND_DBL
    CAROTENE_ROUND_DBL(value)
#else
    const s32 truncated = (s32)(value);
    const f64 fraction = value - truncated;
    // Exactly halfway between two integers and the truncated one is even:
    // keep it (ties go to the even neighbour).
    const bool halfway = (fraction == 0.5 || fraction == -0.5);
    if (halfway && (truncated % 2) == 0)
        return truncated;
    // Everything else: round half away from zero.
    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/////////////// saturate_cast (used in image & signal processing) ///////////////////
// Primary templates: one overload per source type. By default the value is
// simply converted to the destination type _Tp; the explicit specializations
// that follow replace this for every (destination, source) pair where a plain
// conversion could wrap around or truncate.
template<typename _Tp> inline _Tp saturate_cast(u8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
// ---- saturating conversions to u8: results are clamped to [0, UCHAR_MAX] ----
template<> inline u8 saturate_cast<u8>(s8 v)
{
    // only the negative half of s8 is out of range
    return v < 0 ? (u8)0 : (u8)v;
}
template<> inline u8 saturate_cast<u8>(u16 v)
{
    return v > UCHAR_MAX ? (u8)UCHAR_MAX : (u8)v;
}
template<> inline u8 saturate_cast<u8>(s32 v)
{
    // a single unsigned comparison accepts exactly 0..UCHAR_MAX
    if ((u32)v <= UCHAR_MAX)
        return (u8)v;
    return v > 0 ? (u8)UCHAR_MAX : (u8)0;
}
template<> inline u8 saturate_cast<u8>(s16 v)
{
    // widen and reuse the s32 rule
    return saturate_cast<u8>((s32)v);
}
template<> inline u8 saturate_cast<u8>(u32 v)
{
    return v > (u32)UCHAR_MAX ? (u8)UCHAR_MAX : (u8)v;
}
template<> inline u8 saturate_cast<u8>(s64 v)
{
    if ((u64)v <= UCHAR_MAX)
        return (u8)v;
    return v > 0 ? (u8)UCHAR_MAX : (u8)0;
}
template<> inline u8 saturate_cast<u8>(u64 v)
{
    return v > (u64)UCHAR_MAX ? (u8)UCHAR_MAX : (u8)v;
}
// floating point input: round to s32 first, then clamp
template<> inline u8 saturate_cast<u8>(f32 v)
{
    const s32 rounded = round(v);
    return saturate_cast<u8>(rounded);
}
template<> inline u8 saturate_cast<u8>(f64 v)
{
    const s32 rounded = round(v);
    return saturate_cast<u8>(rounded);
}
// ---- saturating conversions to s8: results are clamped to [SCHAR_MIN, SCHAR_MAX] ----
template<> inline s8 saturate_cast<s8>(u8 v) { return (s8)std::min((s32)v, SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
// The in-range test shifts the value by -SCHAR_MIN so that the valid interval
// becomes [0, UCHAR_MAX] and can be checked with one unsigned comparison.
// The shift is done in unsigned arithmetic: subtracting SCHAR_MIN from a signed
// value close to the type's maximum would overflow, which is undefined behaviour.
template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)(((u32)v - (u32)SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
// widen and reuse the s32 rule
template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
// same shifted unsigned range test as for s32, in 64 bits
template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)(((u64)v - (u64)SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
// floating point input: round to s32 first, then clamp
template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
// ---- saturating conversions to u16: results are clamped to [0, USHRT_MAX] ----
template<> inline u16 saturate_cast<u16>(s8 v)
{
    // negative inputs clamp to zero, everything else fits
    return v < 0 ? (u16)0 : (u16)v;
}
template<> inline u16 saturate_cast<u16>(s16 v)
{
    return v < 0 ? (u16)0 : (u16)v;
}
template<> inline u16 saturate_cast<u16>(s32 v)
{
    // a single unsigned comparison accepts exactly 0..USHRT_MAX
    if ((u32)v <= (u32)USHRT_MAX)
        return (u16)v;
    return v > 0 ? (u16)USHRT_MAX : (u16)0;
}
template<> inline u16 saturate_cast<u16>(u32 v)
{
    return v > (u32)USHRT_MAX ? (u16)USHRT_MAX : (u16)v;
}
template<> inline u16 saturate_cast<u16>(s64 v)
{
    if ((u64)v <= (u64)USHRT_MAX)
        return (u16)v;
    return v > 0 ? (u16)USHRT_MAX : (u16)0;
}
template<> inline u16 saturate_cast<u16>(u64 v)
{
    return v > (u64)USHRT_MAX ? (u16)USHRT_MAX : (u16)v;
}
// floating point input: round to s32 first, then clamp
template<> inline u16 saturate_cast<u16>(f32 v)
{
    const s32 rounded = round(v);
    return saturate_cast<u16>(rounded);
}
template<> inline u16 saturate_cast<u16>(f64 v)
{
    const s32 rounded = round(v);
    return saturate_cast<u16>(rounded);
}
// ---- saturating conversions to s16: results are clamped to [SHRT_MIN, SHRT_MAX] ----
template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
// The in-range test shifts the value by -SHRT_MIN so that the valid interval
// becomes [0, USHRT_MAX] and can be checked with one unsigned comparison.
// The shift is done in unsigned arithmetic: subtracting SHRT_MIN from a signed
// value close to the type's maximum would overflow, which is undefined behaviour.
template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)(((u32)v - (u32)SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
// same shifted unsigned range test as for s32, in 64 bits
template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)(((u64)v - (u64)SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
// floating point input: round to s32 first, then clamp
template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
// ---- saturating conversions to u32 ----
// Signed integer sources: negative values clamp to zero.
template<> inline u32 saturate_cast<u32>(s8 v)
{
    return v > 0 ? (u32)v : (u32)0;
}
template<> inline u32 saturate_cast<u32>(s16 v)
{
    return v > 0 ? (u32)v : (u32)0;
}
template<> inline u32 saturate_cast<u32>(s32 v)
{
    return v > 0 ? (u32)v : (u32)0;
}
template<> inline u32 saturate_cast<u32>(s64 v)
{
    // a single unsigned comparison accepts exactly 0..UINT_MAX
    if ((u64)v <= (u64)UINT_MAX)
        return (u32)v;
    return v > 0 ? (u32)UINT_MAX : (u32)0;
}
template<> inline u32 saturate_cast<u32>(u64 v)
{
    return v > (u64)UINT_MAX ? (u32)UINT_MAX : (u32)v;
}
//OpenCV like f32/f64 -> u32 conversion
//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline u32 saturate_cast<u32>(f32 v)
{
    const s32 rounded = round(v);
    return rounded;
}
template<> inline u32 saturate_cast<u32>(f64 v)
{
    const s32 rounded = round(v);
    return rounded;
}
//Negative clipping implementation
//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
// ---- saturating conversions to s32: results are clamped to [INT_MIN, INT_MAX] ----
template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
// The in-range test shifts the value by -INT_MIN so that the valid interval
// becomes [0, UINT_MAX] and can be checked with one unsigned comparison.
// The shift is done in unsigned arithmetic: subtracting INT_MIN from an s64
// close to its maximum would overflow, which is undefined behaviour.
template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)(((u64)v - (u64)INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
// floating point input: round() already yields an s32, no further clamping is applied
template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
// ---- saturating conversions to u64: negative signed inputs clamp to zero ----
template<> inline u64 saturate_cast<u64>(s8 v)
{
    return v > 0 ? (u64)v : (u64)0;
}
template<> inline u64 saturate_cast<u64>(s16 v)
{
    return v > 0 ? (u64)v : (u64)0;
}
template<> inline u64 saturate_cast<u64>(s32 v)
{
    return v > 0 ? (u64)v : (u64)0;
}
template<> inline u64 saturate_cast<u64>(s64 v)
{
    return v > 0 ? (u64)v : (u64)0;
}
// ---- saturating conversion to s64: values above LLONG_MAX clamp to LLONG_MAX ----
template<> inline s64 saturate_cast<s64>(u64 v)
{
    return v > (u64)LLONG_MAX ? (s64)LLONG_MAX : (s64)v;
}
} }
#endif

219
3rdparty/carotene/src/scharr.cpp vendored Normal file
View File

@ -0,0 +1,219 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
// Tells whether Scharr3x3() can handle the given configuration.
// Only the two first-order derivatives are accepted: (dx, dy) == (0, 1) or
// (1, 0). Each maps onto a separable 3x3 filter that combines the generic
// (weighted) kernel, index 3, with the [-1 0 1] kernel, index 1.
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
    // derivative along y: generic row filter, [-1 0 1] column filter
    if (dx == 0 && dy == 1)
        return isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin);

    // derivative along x: [-1 0 1] row filter, generic column filter
    if (dx == 1 && dy == 0)
        return isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin);

    return false;
}
// Applies the 3x3 Scharr operator to an 8-bit image, producing s16 output.
// The work is delegated to SeparableFilter3x3(): the smoothing direction uses
// the generic filter (index 3) with weights {3, 10, 3}, the derivative
// direction uses the built-in [-1 0 1] filter (index 1).
// Aborts through assertSupportedConfiguration() when
// isScharr3x3Supported() rejects the arguments.
void Scharr3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               s16 * dstBase, ptrdiff_t dstStride,
               s32 dx, s32 dy,
               BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
    static s16 smoothing[] = {3, 10, 3};

    // start from "derivative filter in both directions, no custom weights"
    // and switch the smoothing direction over to the generic weighted filter
    u8 rowKind = 1, colKind = 1;
    const s16 *rowWeights = 0, *colWeights = 0;
    if (dy == 1)
    {
        rowKind = 3;
        rowWeights = smoothing;
    }
    else
    {
        colKind = 3;
        colWeights = smoothing;
    }

    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
                       rowKind, colKind, rowWeights, colWeights,
                       border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Computes both Scharr derivatives of an 8-bit image with `cn` interleaved
// channels in a single pass.
//
// For every source element the destination receives two interleaved s16
// values: first the x-derivative, then the y-derivative
// (dst[2*x] = d/dx, dst[2*x + 1] = d/dy). Borders are mirrored without
// repeating the edge element (row -1 -> row 1, column -1 -> column 1).
//
// The hand written ARMv7 assembly is only used for GCC 4.x releases older than
// 4.7 / 4.6 respectively; every other compiler (newer GCC, clang, AArch64
// targets, non-GNU compilers) takes the intrinsics path. The previous guards
// tested __GNUC_MINOR__ alone, which wrongly selected the assembly for e.g.
// GCC 5.0 and for compilers that do not define the macro at all.
void ScharrDeriv(const Size2D &size, s32 cn,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t colsn = size.width*cn;
    // last start index (exclusive) for which a full 8-element vector fits
    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;

    // two temporary rows, each with room for one extra pixel on both sides
    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
    std::vector<s16> _tempBuf((delta << 1) + 64);
    // trow0: vertically smoothed row (3, 10, 3); trow1: vertical difference row.
    // Both start `cn` elements into their storage so index -cn..-1 is valid.
    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);

    int16x8_t vc3 = vmovq_n_s16(3);
    int16x8_t vc10 = vmovq_n_s16(10);
    uint8x8_t v8c10 = vmov_n_u8(10);

    for(size_t y = 0; y < size.height; y++ )
    {
        // neighbouring rows, mirrored at the top and bottom of the image
        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // do vertical convolution
        size_t x = 0;
        for( ; x < roiw8; x += 8 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
                "vld1.8 {d0}, [%[src0]]                                \n\t"
                "vld1.8 {d2}, [%[src2]]                                \n\t"
                "vld1.8 {d1}, [%[src1]]                                \n\t"
                "vaddl.u8 q2, d2, d0                                   \n\t"
                "vmull.u8 q3, d1, %[vc10]                              \n\t"
                "vsubl.u8 q4, d2, d0                                   \n\t"
                "vmla.s16 q3, q2, %q[vc3]                              \n\t"
                "vst1.16 {d8-d9}, [%[out1],:128]                       \n\t"
                "vst1.16 {d6-d7}, [%[out0],:128]                       \n\t"
                :
                : [out0] "r" (trow0 + x),
                  [out1] "r" (trow1 + x),
                  [src0] "r" (srow0 + x),
                  [src1] "r" (srow1 + x),
                  [src2] "r" (srow2 + x),
                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
            );
#else
            uint8x8_t s0 = vld1_u8(srow0 + x);
            uint8x8_t s1 = vld1_u8(srow1 + x);
            uint8x8_t s2 = vld1_u8(srow2 + x);

            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);

            vst1q_s16(trow1 + x, t1);
            vst1q_s16(trow0 + x, t0);
#endif
        }
        // scalar tail of the vertical pass
        for( ; x < colsn; x++ )
        {
            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
            trow1[x] = (s16)(srow2[x] - srow0[x]);
        }

        // make border
        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
        for( s32 k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
        for( ; x < roiw8; x += 8 )
        {
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
            __asm__ (
                "vld1.16 {d4-d5}, [%[s2ptr]]                           \n\t"
                "vld1.16 {d8-d9}, [%[s4ptr]]                           \n\t"
                "vld1.16 {d6-d7}, [%[s3ptr],:128]                      \n\t"
                "vld1.16 {d0-d1}, [%[s0ptr]]                           \n\t"
                "vld1.16 {d2-d3}, [%[s1ptr]]                           \n\t"
                "vadd.i16 q7, q2, q4                                   \n\t"
                "vmul.s16 q6, q3, %q[vc10]                             \n\t"
                "vsub.s16 q5, q1, q0                                   \n\t"
                "vmla.s16 q6, q7, %q[vc3]                              \n\t"
                "vst2.16 {d10-d13}, [%[out]]                           \n\t"
                :
                : [out] "r" (drow + x * 2),
                  [s0ptr] "r" (trow0 + x - cn),
                  [s1ptr] "r" (trow0 + x + cn),
                  [s2ptr] "r" (trow1 + x - cn),
                  [s3ptr] "r" (trow1 + x),
                  [s4ptr] "r" (trow1 + x + cn),
                  [vc10] "w" (vc10), [vc3] "w" (vc3)
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
            );
#else
            int16x8_t s0 = vld1q_s16(trow0 + x - cn);
            int16x8_t s1 = vld1q_s16(trow0 + x + cn);
            int16x8_t s2 = vld1q_s16(trow1 + x - cn);
            int16x8_t s3 = vld1q_s16(trow1 + x);
            int16x8_t s4 = vld1q_s16(trow1 + x + cn);

            int16x8_t s3x10 = vmulq_s16(s3, vc10);
            int16x8_t s24 = vaddq_s16(s2, s4);

            int16x8x2_t vr;
            vr.val[0] = vsubq_s16(s1, s0);
            vr.val[1] = vmlaq_s16(s3x10, s24, vc3);

            vst2q_s16(drow + x*2, vr);
#endif // GCC 4.x older than 4.6, ARMv7 only
        }
        // scalar tail of the horizontal pass
        for( ; x < colsn; x++ )
        {
            drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
            drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
        }
    }
#else
    (void)size;
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,109 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
// Tells whether SeparableFilter3x3() can handle the given configuration.
// `dx` / `dy` are the row / column filter selectors and must lie in [0, 3].
bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
    if (!isSupportedConfiguration())
        return false;

    // image must be at least 9 pixels wide and non-empty ...
    if (size.width < 9 || size.height < 1)
        return false;

    // ... and, together with the available margins, at least two rows tall
    if ((size.height + borderMargin.top + borderMargin.bottom) < 2)
        return false;

    // filter selectors index a 4x4 dispatch table
    if (dx < 0 || dx >= 4 || dy < 0 || dy >= 4)
        return false;

    switch (border)
    {
    case BORDER_MODE_CONSTANT:
    case BORDER_MODE_REFLECT:
    case BORDER_MODE_REFLECT101:
    case BORDER_MODE_REPLICATE:
        return true;
    default:
        return false;
    }
}
// Applies a separable 3x3 filter to an 8-bit image, producing s16 output.
//
// rowFilter / colFilter select the 1D kernel for each direction:
//   0 -> "121", 1 -> "m101", 2 -> "1m21", 3 -> generic kernel taken from
//   `xw` (rows) / `yw` (columns).
// A generic kernel without its weight array is a programming error and
// terminates the process with std::abort().
void SeparableFilter3x3(const Size2D &size,
                        const u8 * srcBase, ptrdiff_t srcStride,
                        s16 * dstBase, ptrdiff_t dstStride,
                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
#ifdef CAROTENE_NEON
    const bool rowWeightsMissing = !xw && rowFilter >= 3;
    const bool colWeightsMissing = !yw && colFilter >= 3;
    if (rowWeightsMissing || colWeightsMissing)
        std::abort();//Couldn't call generic filter without provided weights

    typedef void (*FilterFn)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
                             const s16*, const s16*, BORDER_MODE, u8, Margin);

    // dispatch table indexed as [column filter][row filter]
    static FilterFn dispatch[4][4]=
    {
        // column filter 0: "121"
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_121>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process
        },
        // column filter 1: "m101"
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_m101>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process
        },
        // column filter 2: "1m21"
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_1m21>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process
        },
        // column filter 3: generic weights
        {
            internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16Generic>::process,
            internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process
        }
    };

    FilterFn run = dispatch[colFilter][rowFilter];
    run(size, srcBase, srcStride, dstBase, dstStride,
        xw, yw, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)xw;
    (void)yw;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

1161
3rdparty/carotene/src/separable_filter.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

317
3rdparty/carotene/src/sobel.cpp vendored Normal file
View File

@ -0,0 +1,317 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
// Tells whether the u8 -> s16 Sobel3x3() can handle the given configuration:
// derivative orders must be 0, 1 or 2, at least one of them non-zero, and the
// underlying separable 3x3 filter must accept the same arguments.
bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
                         s32 dx, s32 dy, Margin borderMargin)
{
    const bool ordersValid = dx < 3 && dx >= 0 &&
                             dy < 3 && dy >= 0 &&
                             (dx + dy) > 0;
    if (!ordersValid)
        return false;

    return isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
}
// 3x3 Sobel operator for 8-bit input and s16 output.
// The derivative orders are used directly as row / column filter selectors of
// SeparableFilter3x3(); no custom weights are passed.
// Aborts through assertSupportedConfiguration() when isSobel3x3Supported()
// rejects the arguments.
void Sobel3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              s16 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
    // derivative order doubles as the built-in filter index
    const u8 rowFilter = (u8)dx;
    const u8 colFilter = (u8)dy;
    const s16 * const noWeights = 0;

    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
                       rowFilter, colFilter, noWeights, noWeights,
                       borderType, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
// Tells whether the f32 -> f32 Sobel3x3() can handle the given configuration:
// derivative orders must be 0, 1 or 2 with at least one non-zero, the image
// must be at least 4x2, and the border mode must be one of the four
// implemented modes.
bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
                            s32 dx, s32 dy)
{
    if (!isSupportedConfiguration())
        return false;

    const bool ordersValid = dx < 3 && dx >= 0 &&
                             dy < 3 && dy >= 0 &&
                             (dx + dy) > 0;
    if (!ordersValid)
        return false;

    if (size.width < 4 || size.height < 2)
        return false;

    switch (border)
    {
    case BORDER_MODE_CONSTANT:
    case BORDER_MODE_REFLECT:
    case BORDER_MODE_REFLECT101:
    case BORDER_MODE_REPLICATE:
        return true;
    default:
        return false;
    }
}
// 3x3 Sobel operator for f32 input and f32 output.
//
// dx / dy are the derivative orders (0, 1 or 2) along x / y. For every row the
// vertical 1D kernel is applied first, then the horizontal one; most of the row
// is processed four pixels at a time with NEON and the last pixels with scalar
// code.
//
// The result of row y is kept in one of two temporary rows (selected by the
// parity of y) and is copied to the destination only while the next row is
// being processed.
// NOTE(review): the one-row delay presumably allows srcBase == dstBase — confirm
// with the callers.
//
// Aborts through assertSupportedConfiguration() when
// isSobel3x3f32Supported() rejects the arguments.
void Sobel3x3(const Size2D &size,
              const f32 * srcBase, ptrdiff_t srcStride,
              f32 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, f32 borderValue)
{
    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
#ifdef CAROTENE_NEON
    // Artificial row filled with the border value; stands in for the row above
    // the first / below the last image row in constant border mode.
    std::vector<f32> _tmp;
    f32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        // The vector loop below loads 4 floats starting at indices up to
        // size.width, i.e. it touches tmp[size.width + 3]. Four extra elements
        // keep those loads inside the buffer (previously they read past its end).
        _tmp.assign(size.width + 2 + 4, borderValue);
        tmp = &_tmp[1];
    }

    // two aligned temporary rows holding the results of the even / odd rows
    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
    std::vector<f32> _tempBuf((delta << 1) + 64);
    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);

    for( size_t y = 0; y < size.height; y++ )
    {
        const f32* srow0;
        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const f32* srow2;
        // drow: destination of the previous row's result; drow1: destination of
        // the current row (only written for the very last row)
        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);

        // pick the rows above and below according to the border mode
        if (borderType == BORDER_MODE_REFLECT101) {
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
        } else  if (borderType == BORDER_MODE_CONSTANT) {
            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            srow2 =  y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
        }

        // sliding window of three vertically filtered vectors
        float32x4_t tprev = vmovq_n_f32(0.f);
        float32x4_t tcurr = vmovq_n_f32(0.f);
        float32x4_t tnext = vmovq_n_f32(0.f);
        float32x4_t t0, t1, t2;

        // do vertical convolution
        // For all but the last two rows the loads may run past the end of the
        // row (into the following rows); for the last two rows the vector loop
        // stops early so that it never reads past the image.
        size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
        for( ; x <= bcolsn; x += 4 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            float32x4_t x0 = vld1q_f32(srow0 + x);
            float32x4_t x1 = vld1q_f32(srow1 + x);
            float32x4_t x2 = vld1q_f32(srow2 + x);

            tprev = tcurr;
            tcurr = tnext;
            if(!dy)
            {
                // smoothing kernel [1 2 1]
                tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);
            }
            else if(dy == 2)
            {
                // second derivative kernel [1 -2 1]
                tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));
            }
            else
            {
                // first derivative kernel [-1 0 1]
                tnext = vsubq_f32(x2, x0);
            }

            if(!x) {
                // first vector: only prime the window; lane 3 of tcurr plays the
                // role of the pixel left of column 0
                tcurr = tnext;
                // make border
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_f32(borderValue,tcurr, 3);
                }
                else if (borderType == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
                }
                else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
                }
                continue;
            }

            internal::prefetch(trow0 + x);
            internal::prefetch(trow1 + x);

            // left, centre and right neighbours of the four pixels at x-4
            t0 = vextq_f32(tprev, tcurr, 3);
            t1 = tcurr;
            t2 = vextq_f32(tcurr, tnext, 1);
            if(!dx)
            {
                t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));
            }
            else if(dx == 2)
            {
                t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));
            }
            else
            {
                t0 = vsubq_f32(t2, t0);
            }

            if(!(y%2))
            {
                vst1q_f32(trow0 + x - 4, t0);
            }
            else
            {
                vst1q_f32(trow1 + x - 4, t0);
            }
        }
        // step back to the first column that still needs scalar processing; when
        // the vector loop consumed the whole row, redo the last column because
        // its right neighbour has to come from the border rule
        x -= 4;
        if(x == size.width){
            x--;
        }
        // vertically filtered values of the previous, current and next column
        f32 prevx = 0, rowx = 0, nextx = 0;
        if(!dy)
        {
            prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 4*borderValue :
                                                                   srow2[0] + 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] + 2*srow1[x] + srow0[x];
        }
        else if(dy == 2)
        {
            prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 0.f :
                                                                   srow2[0] - 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] - 2*srow1[x] + srow0[x];
        }
        else
        {
            prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 0.f :
                                                                   srow2[0] - srow0[0]) );
            rowx = srow2[x] - srow0[x];
        }

        for( ; x < size.width; x++ )
        {
            if(x+1 == size.width) {
                // make border
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    if(!dy) {
                        nextx = 4*borderValue;
                    } else {
                        nextx = 0.f;
                    }
                } else if (borderType == BORDER_MODE_REFLECT101)
                {
                    if(!dy) {
                        nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
                    } else if(dy == 2) {
                        nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
                    } else {
                        nextx = srow2[x-1] - srow0[x-1];
                    }
                } else {
                    if(!dy) {
                        nextx = srow2[x] + 2*srow1[x] + srow0[x];
                    } else if(dy == 2) {
                        nextx = srow2[x] - 2*srow1[x] + srow0[x];
                    } else {
                        nextx = srow2[x] - srow0[x];
                    }
                }
            } else {
                if(!dy) {
                    nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
                } else if(dy == 2) {
                    nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
                } else {
                    nextx = srow2[x+1] - srow0[x+1];
                }
            }
            // horizontal kernel on the three vertically filtered values
            f32 res;
            if(dx==1) {
                res = nextx - prevx;
            } else if(!dx) {
                res = prevx + 2*rowx + nextx;
            } else {
                res = prevx - 2*rowx + nextx;
            }
            if(!(y%2)) {
                *(trow0+x) = res;
            } else {
                *(trow1+x) = res;
            }
            prevx = rowx;
            rowx = nextx;
        }

        // flush the previous row's result (stored in the other temporary row)
        if(y>0) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(y%2)
                    *(drow + x1) = trow0[x1];
                else
                    *(drow + x1) = trow1[x1];
            }
        }
        // the last row has no successor, so flush it immediately
        if(y == size.height-1) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(!(y%2))
                    *(drow1 + x1) = trow0[x1];
                else
                    *(drow1 + x1) = trow1[x1];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
} // namespace CAROTENE_NS

621
3rdparty/carotene/src/sub.cpp vendored Normal file
View File

@ -0,0 +1,621 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct SubWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
}
};
template <typename T, typename WT>
struct SubSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
}
};
} // namespace
#endif
// Element-wise src0 - src1 for u8 images, written to a u8 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<u8, s16>, anything else -> SubWrap<u8, s16>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u8, s16>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u8, s16>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for u8 images, written to an s16 image.
// The conversion policy argument is accepted but unused.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Column limits below which a full 32- or 8-element vector step still fits in the row.
    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        // The vector stores go through a u16 view of the destination row because
        // vsubl_u8 produces u16 lanes; the scalar tail writes through the s16 view.
        u16 * outBits = internal::getRowPtr((u16 *)dstBase, dstStride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // 32 elements per step.
        while (col < end32)
        {
            internal::prefetch(lhs + col);
            internal::prefetch(rhs + col);

            const uint8x16_t v_lhs0 = vld1q_u8(lhs + col);
            const uint8x16_t v_lhs1 = vld1q_u8(lhs + col + 16);
            const uint8x16_t v_rhs0 = vld1q_u8(rhs + col);
            const uint8x16_t v_rhs1 = vld1q_u8(rhs + col + 16);

            vst1q_u16(outBits + col, vsubl_u8(vget_low_u8(v_lhs0), vget_low_u8(v_rhs0)));
            vst1q_u16(outBits + col + 8, vsubl_u8(vget_high_u8(v_lhs0), vget_high_u8(v_rhs0)));
            vst1q_u16(outBits + col + 16, vsubl_u8(vget_low_u8(v_lhs1), vget_low_u8(v_rhs1)));
            vst1q_u16(outBits + col + 24, vsubl_u8(vget_high_u8(v_lhs1), vget_high_u8(v_rhs1)));

            col += 32;
        }

        // 8 elements per step.
        while (col < end8)
        {
            const uint8x8_t v_lhs = vld1_u8(lhs + col);
            const uint8x8_t v_rhs = vld1_u8(rhs + col);
            vst1q_u16(outBits + col, vsubl_u8(v_lhs, v_rhs));

            col += 8;
        }

        // Scalar tail.
        while (col < size.width)
        {
            out[col] = (s16)lhs[col] - (s16)rhs[col];
            ++col;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Element-wise src0 - src1 for u8 images, written to an f32 image.
// Each difference is formed in 16-bit lanes, sign-extended to 32 bits and converted to f32.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         f32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Column limits below which a full 32- or 8-element vector step still fits in the row.
    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        f32 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // 32 elements per step, handled as two 16-element halves.
        while (col < end32)
        {
            internal::prefetch(lhs + col);
            internal::prefetch(rhs + col);

            const uint8x16_t v_lhs0 = vld1q_u8(lhs + col);
            const uint8x16_t v_lhs1 = vld1q_u8(lhs + col + 16);
            const uint8x16_t v_rhs0 = vld1q_u8(rhs + col);
            const uint8x16_t v_rhs1 = vld1q_u8(rhs + col + 16);

            // First 16 elements.
            int16x8_t v_diffLo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_lhs0), vget_low_u8(v_rhs0)));
            int16x8_t v_diffHi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_lhs0), vget_high_u8(v_rhs0)));
            vst1q_f32(out + col + 0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_diffLo))));
            vst1q_f32(out + col + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_diffLo))));
            vst1q_f32(out + col + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_diffHi))));
            vst1q_f32(out + col + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_diffHi))));

            // Second 16 elements.
            v_diffLo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(v_lhs1), vget_low_u8(v_rhs1)));
            v_diffHi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_lhs1), vget_high_u8(v_rhs1)));
            vst1q_f32(out + col + 16, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_diffLo))));
            vst1q_f32(out + col + 20, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_diffLo))));
            vst1q_f32(out + col + 24, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_diffHi))));
            vst1q_f32(out + col + 28, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_diffHi))));

            col += 32;
        }

        // 8 elements per step.
        while (col < end8)
        {
            const uint8x8_t v_lhs = vld1_u8(lhs + col);
            const uint8x8_t v_rhs = vld1_u8(rhs + col);
            const int16x8_t v_diff = vreinterpretq_s16_u16(vsubl_u8(v_lhs, v_rhs));
            vst1q_f32(out + col + 0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_diff))));
            vst1q_f32(out + col + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_diff))));

            col += 8;
        }

        // Scalar tail.
        while (col < size.width)
        {
            out[col] = (f32)lhs[col] - (f32)rhs[col];
            ++col;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
// Element-wise src0 - src1 where src0 is u8 and src1 is s16, written to an s16 image.
// policy == CONVERT_POLICY_SATURATE uses saturating subtraction (vqsubq_s16 / saturate_cast);
// any other policy uses plain subtraction (vsubq_s16 / cast to s16).
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Column limits below which a full 16- or 8-element vector step still fits in the row.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const s16 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        if (policy != CONVERT_POLICY_SATURATE)
        {
            // Non-saturating variant.
            while (col < end16)
            {
                internal::prefetch(lhs + col);
                internal::prefetch(rhs + col);

                // Zero-extend the u8 operand to 16-bit lanes.
                const uint8x16_t v_lhs = vld1q_u8(lhs + col);
                const int16x8_t v_lhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_lhs)));
                const int16x8_t v_lhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_lhs)));
                const int16x8_t v_rhsLo = vld1q_s16(rhs + col);
                const int16x8_t v_rhsHi = vld1q_s16(rhs + col + 8);

                vst1q_s16(out + col, vsubq_s16(v_lhsLo, v_rhsLo));
                vst1q_s16(out + col + 8, vsubq_s16(v_lhsHi, v_rhsHi));

                col += 16;
            }
            while (col < end8)
            {
                const int16x8_t v_lhs = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs + col)));
                const int16x8_t v_rhs = vld1q_s16(rhs + col);
                vst1q_s16(out + col, vsubq_s16(v_lhs, v_rhs));

                col += 8;
            }
            while (col < size.width)
            {
                out[col] = (s16)((s32)lhs[col] - (s32)rhs[col]);
                ++col;
            }
        }
        else
        {
            // Saturating variant.
            while (col < end16)
            {
                internal::prefetch(lhs + col);
                internal::prefetch(rhs + col);

                // Zero-extend the u8 operand to 16-bit lanes.
                const uint8x16_t v_lhs = vld1q_u8(lhs + col);
                const int16x8_t v_lhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_lhs)));
                const int16x8_t v_lhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_lhs)));
                const int16x8_t v_rhsLo = vld1q_s16(rhs + col);
                const int16x8_t v_rhsHi = vld1q_s16(rhs + col + 8);

                vst1q_s16(out + col, vqsubq_s16(v_lhsLo, v_rhsLo));
                vst1q_s16(out + col + 8, vqsubq_s16(v_lhsHi, v_rhsHi));

                col += 16;
            }
            while (col < end8)
            {
                const int16x8_t v_lhs = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs + col)));
                const int16x8_t v_rhs = vld1q_s16(rhs + col);
                vst1q_s16(out + col, vqsubq_s16(v_lhs, v_rhs));

                col += 8;
            }
            while (col < size.width)
            {
                out[col] = internal::saturate_cast<s16>((s32)lhs[col] - (s32)rhs[col]);
                ++col;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 where src0 is s16 and src1 is u8, written to an s16 image.
// policy == CONVERT_POLICY_SATURATE uses saturating subtraction (vqsubq_s16 / saturate_cast);
// any other policy uses plain subtraction (vsubq_s16 / cast to s16).
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Column limits below which a full 16- or 8-element vector step still fits in the row.
    const size_t end16 = size.width >= 15 ? size.width - 15 : 0;
    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const s16 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        if (policy != CONVERT_POLICY_SATURATE)
        {
            // Non-saturating variant.
            while (col < end16)
            {
                internal::prefetch(lhs + col);
                internal::prefetch(rhs + col);

                const int16x8_t v_lhsLo = vld1q_s16(lhs + col);
                const int16x8_t v_lhsHi = vld1q_s16(lhs + col + 8);
                // Zero-extend the u8 operand to 16-bit lanes.
                const uint8x16_t v_rhs = vld1q_u8(rhs + col);
                const int16x8_t v_rhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_rhs)));
                const int16x8_t v_rhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_rhs)));

                vst1q_s16(out + col, vsubq_s16(v_lhsLo, v_rhsLo));
                vst1q_s16(out + col + 8, vsubq_s16(v_lhsHi, v_rhsHi));

                col += 16;
            }
            while (col < end8)
            {
                const int16x8_t v_lhs = vld1q_s16(lhs + col);
                const int16x8_t v_rhs = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs + col)));
                vst1q_s16(out + col, vsubq_s16(v_lhs, v_rhs));

                col += 8;
            }
            while (col < size.width)
            {
                out[col] = (s16)((s32)lhs[col] - (s32)rhs[col]);
                ++col;
            }
        }
        else
        {
            // Saturating variant.
            while (col < end16)
            {
                internal::prefetch(lhs + col);
                internal::prefetch(rhs + col);

                const int16x8_t v_lhsLo = vld1q_s16(lhs + col);
                const int16x8_t v_lhsHi = vld1q_s16(lhs + col + 8);
                // Zero-extend the u8 operand to 16-bit lanes.
                const uint8x16_t v_rhs = vld1q_u8(rhs + col);
                const int16x8_t v_rhsLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_rhs)));
                const int16x8_t v_rhsHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_rhs)));

                vst1q_s16(out + col, vqsubq_s16(v_lhsLo, v_rhsLo));
                vst1q_s16(out + col + 8, vqsubq_s16(v_lhsHi, v_rhsHi));

                col += 16;
            }
            while (col < end8)
            {
                const int16x8_t v_lhs = vld1q_s16(lhs + col);
                const int16x8_t v_rhs = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs + col)));
                vst1q_s16(out + col, vqsubq_s16(v_lhs, v_rhs));

                col += 8;
            }
            while (col < size.width)
            {
                out[col] = internal::saturate_cast<s16>((s32)lhs[col] - (s32)rhs[col]);
                ++col;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for s8 images, written to an s8 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<s8, s16>, anything else -> SubWrap<s8, s16>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s8, s16>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s8, s16>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for s16 images, written to an s16 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<s16, s32>, anything else -> SubWrap<s16, s32>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s16, s32>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s16, s32>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for u16 images, written to a u16 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<u16, s32>, anything else -> SubWrap<u16, s32>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u16, s32>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u16, s32>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for s32 images, written to an s32 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<s32, s64>, anything else -> SubWrap<s32, s64>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<s32, s64>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<s32, s64>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for u32 images, written to a u32 image.
// policy selects the functor handed to internal::vtransform:
//   CONVERT_POLICY_SATURATE -> SubSaturate<u32, s64>, anything else -> SubWrap<u32, s64>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const u32 * src0Base, ptrdiff_t src0Stride,
         const u32 * src1Base, ptrdiff_t src1Stride,
         u32 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (policy != CONVERT_POLICY_SATURATE)
    {
        internal::vtransform(size,
                             src0Base, src0Stride,
                             src1Base, src1Stride,
                             dstBase, dstStride,
                             SubWrap<u32, s64>());
        return;
    }

    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         SubSaturate<u32, s64>());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
// Element-wise src0 - src1 for f32 images, written to an f32 image.
// There is no conversion policy for this overload; the work is delegated to
// internal::vtransform with SubWrap<f32, f32>.
// Without CAROTENE_NEON the body only marks the arguments as used.
void sub(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride,
                         SubWrap<f32, f32>());
#else
    (void)size;
    (void)src0Base; (void)src0Stride;
    (void)src1Base; (void)src1Stride;
    (void)dstBase;  (void)dstStride;
#endif
}
} // namespace CAROTENE_NS

385
3rdparty/carotene/src/sum.cpp vendored Normal file
View File

@ -0,0 +1,385 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
// Reports whether sum() accepts the given channel count: true for 1, 2, 3 or 4 channels.
bool isSumSupported(u32 channels)
{
    return channels >= 1 && channels <= 4;
}
// Adds the per-channel sums of an interleaved u8 image to sumdst.
//   _size     - image size in pixels
//   srcBase   - first source row; srcStride is the step between rows passed to getRowPtr
//   sumdst    - `channels` accumulators; results are ADDED to the values already stored
//   channels  - number of interleaved channels; must satisfy isSumSupported()
// Without CAROTENE_NEON the body only marks the arguments as used.
void sum(const Size2D &_size,
         const u8 * srcBase, ptrdiff_t srcStride,
         u32 * sumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Treat a tightly packed image as a single long row. A row holds width * channels
    // u8 elements, so the channel count must be part of the comparison (as in sqsum());
    // comparing against size.width alone never matched for multi-channel images.
    if (srcStride == (ptrdiff_t)(size.width * channels))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // Row length in elements (not pixels).
    const ptrdiff_t width = size.width * channels;

    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        ptrdiff_t i = 0;

        if (channels == 3)
        {
            // 24 consecutive elements hold 8 pixels. The three u32 accumulators are named
            // after the channel order (1-based) of their four lanes.
            uint32x4_t vs1231 = vdupq_n_u32(0);
            uint32x4_t vs3123 = vdupq_n_u32(0);
            uint32x4_t vs2312 = vdupq_n_u32(0);

            // Full blocks of 257 groups of 24 elements: every u16 lane receives 257 u8
            // values, and 257 * 255 = 65535 still fits in 16 bits.
            for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
                {
                    internal::prefetch(src + j + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + j + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + j + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
                }

                // Pair up the half-vectors whose lanes share the same channel order.
                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }

            // Remaining whole groups of 24 elements (fewer than 257 of them).
            if (i <= width - 8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
                {
                    internal::prefetch(src + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + 16));
                }

                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }

            // Spill in the order 1231 | 2312 | 3123 so that indices congruent modulo 3
            // belong to the same channel.
            u32 sum[12];
            vst1q_u32(sum+0, vs1231);
            vst1q_u32(sum+4, vs2312);
            vst1q_u32(sum+8, vs3123);

            // Scalar tail, one pixel per step.
            for (; i < width; i += 3, src += 3)
            {
                sumdst[0] += src[0];
                sumdst[1] += src[1];
                sumdst[2] += src[2];
            }

            sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
            sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
            sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
        }
        else
        {
            // 1, 2 or 4 channels: lane n of vs collects elements n and n + 4 of every
            // 8-element group, so the lanes keep a fixed channel assignment.
            uint32x4_t vs = vdupq_n_u32(0);

            // Full blocks of 257 groups of 8 elements (same u16 overflow bound as above).
            for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
                for (int j = 8; j < 257 * 8; j += 8)
                {
                    internal::prefetch(src + j);
                    s1 = vaddw_u8(s1, vld1_u8(src + j));
                }

                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }

            // Remaining whole groups of 8 elements.
            if (i < width - 7)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
                for(i+=8,src+=8; i < width-7; i+=8,src+=8)
                {
                    internal::prefetch(src);
                    s1 = vaddw_u8(s1, vld1_u8(src));
                }
                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }

            if (channels == 1)
            {
                // Fold all four lanes into one value, then add the scalar tail.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));

                u32 s0 = vget_lane_u32(vs1, 0);
                for(; i < width; ++i,++src)
                    s0 += src[0];
                sumdst[0] += s0;
            }
            else if (channels == 4)
            {
                // One lane per channel: add the vector straight onto sumdst.
                vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));

                for(; i < width; i+=4,src+=4)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                    sumdst[2] += src[2];
                    sumdst[3] += src[3];
                }
            }
            else//if (channels == 2)
            {
                // Lanes alternate between the two channels: fold high onto low.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));

                for(; i < width; i+=2,src+=2)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                }
            }
        }//channels != 3
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)channels;
#endif
}
// Adds the per-channel sums of an interleaved f32 image to sumdst.
// _size is the image size in pixels, srcBase/srcStride address the source rows via
// getRowPtr, sumdst holds `channels` f64 accumulators (results are ADDED to the values
// already stored) and channels must satisfy isSumSupported().
// Partial sums are kept in f32 vector lanes for one row and only then added to the f64
// accumulators. Without CAROTENE_NEON the body only marks the arguments as used.
void sum(const Size2D &_size,
const f32 * srcBase, ptrdiff_t srcStride,
f64 * sumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
// Collapse the image into a single row when the stride equals the width.
// NOTE(review): the comparison uses size.width with no channel or sizeof(f32) factor,
// unlike sqsum() which multiplies by channels - confirm whether this can ever hold for
// real f32 data. Enabling it would also lengthen the f32 accumulation per row.
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
// Row length in elements (not pixels).
const ptrdiff_t width = size.width * channels;
for(size_t k = 0; k < size.height; ++k)
{
const f32* src = internal::getRowPtr( srcBase, srcStride, k);
ptrdiff_t i = 0;
if (channels == 3)
{
// 12 consecutive elements hold 4 pixels; each accumulator is named after the
// channel order (1-based) of its four lanes.
float32x4_t vs1231 = vdupq_n_f32(0);
float32x4_t vs2312 = vdupq_n_f32(0);
float32x4_t vs3123 = vdupq_n_f32(0);
for(; i <= width-12; i += 12)
{
internal::prefetch(src + i + 12);
vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
}
// Spill in the order 1231 | 2312 | 3123 so that indices congruent modulo 3
// belong to the same channel.
f32 s[12];
vst1q_f32(s + 0, vs1231);
vst1q_f32(s + 4, vs2312);
vst1q_f32(s + 8, vs3123);
// The four lane values are added in f32 before being added to the f64 accumulator.
sumdst[0] += s[0] + s[3] + s[6] + s[9];
sumdst[1] += s[1] + s[4] + s[7] + s[10];
sumdst[2] += s[2] + s[5] + s[8] + s[11];
// Scalar tail, one pixel per step.
for( ; i < width; i+=3)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
sumdst[2] += src[i+2];
}
}
else
{
// 1, 2 or 4 channels: four elements per step, so every lane keeps a fixed channel.
float32x4_t vs = vdupq_n_f32(0);
for(; i <= width-4; i += 4)
{
internal::prefetch(src + i);
vs = vaddq_f32(vs, vld1q_f32(src+i));
}
if (channels == 1)
{
// All lanes belong to the single channel: reduce them, then add the scalar tail.
float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0] + s[1];
for( ; i < width; i++)
sumdst[0] += src[i];
}
else if (channels == 4)
{
// One lane per channel; width is a multiple of 4 here, so the vector loop
// leaves no tail.
f32 s[4];
vst1q_f32(s, vs);
sumdst[0] += s[0];
sumdst[1] += s[1];
sumdst[2] += s[2];
sumdst[3] += s[3];
}
else//if (channels == 2)
{
// Lanes alternate between the two channels: fold high onto low.
float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0];
sumdst[1] += s[1];
// At most one pixel (two elements) can remain after the 4-element steps.
if(i < width)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
}
}
}//channels != 3
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)channels;
#endif
}
// Reports whether sqsum() accepts the given channel count: true when the count is
// non-zero and divides 4 evenly (1, 2 or 4 channels).
bool isSqsumSupported(u32 channels)
{
    if (channels == 0)
        return false;

    return 4 % channels == 0;
}
// Adds the per-channel sums and sums of squares of an interleaved u8 image to
// sumdst and sqsumdst.
//   _size     - image size in pixels
//   srcBase   - first source row; srcStride is the step between rows passed to getRowPtr
//   sumdst    - `channels` f64 accumulators for the plain sums (results are ADDED)
//   sqsumdst  - `channels` f64 accumulators for the sums of squares (results are ADDED)
//   channels  - number of interleaved channels; must satisfy isSqsumSupported()
// Without CAROTENE_NEON the body only marks the arguments as used.
void sqsum(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride,
           f64 * sumdst, f64 * sqsumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSqsumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Treat a tightly packed image as a single long row.
    if (srcStride == (ptrdiff_t)(size.width*channels))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // Row length in elements (not pixels).
    const size_t width = size.width * channels;

    // Maximum number of elements accumulated in the u32 vector lanes before they are
    // flushed to the f64 outputs. Every 8-element step adds two squares, i.e. at most
    // 2 * 255 * 255 = 130050, to each v_sqsum lane. 1 << 18 elements are 32768 steps,
    // giving at most 4261478400, which still fits in 32 bits. The previous value of
    // 1 << 23 allowed 2^20 steps and could overflow the lanes on long bright rows.
    size_t blockSize0 = 1 << 18;
    size_t roiw8 = width & ~7;

    uint32x4_t v_zero = vdupq_n_u32(0u);

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw8)
        {
            // End index (exclusive) of the current block.
            size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
            uint32x4_t v_sum = v_zero;
            uint32x4_t v_sqsum = v_zero;

            for ( ; j < blockSize ; j += 8, src += 8)
            {
                internal::prefetch(src);
                uint8x8_t v_src0 = vld1_u8(src);

                uint16x8_t v_src = vmovl_u8(v_src0);
                uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
                // Lane n collects elements n and n + 4 of the group.
                v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
                v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
                v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
            }

            u32 arsum[8];
            vst1q_u32(arsum, v_sum);
            vst1q_u32(arsum + 4, v_sqsum);

            // Lane n belongs to channel n % channels (channels is 1, 2 or 4).
            sumdst[0] += (f64)arsum[0];
            sumdst[1 % channels] += (f64)arsum[1];
            sumdst[2 % channels] += (f64)arsum[2];
            sumdst[3 % channels] += (f64)arsum[3];
            sqsumdst[0] += (f64)arsum[4];
            sqsumdst[1 % channels] += (f64)arsum[5];
            sqsumdst[2 % channels] += (f64)arsum[6];
            sqsumdst[3 % channels] += (f64)arsum[7];
        }
        // collect a few last elements in the current row
        // it's ok to process channels elements per step
        // since we could handle 1,2 or 4 channels
        // we always have channels-fold amount of elements remaining
        for ( ; j < width; j+=channels, src+=channels)
        {
            for (u32 kk = 0; kk < channels; kk++)
            {
                u32 srcval = src[kk];
                sumdst[kk] += srcval;
                sqsumdst[kk] += srcval * srcval;
            }
        }
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)sqsumdst;
    (void)channels;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than
//time for simultaneous 2 lines matching for the same amount of data
// Reports whether matchTemplate() accepts a template of the given size: the
// configuration must be supported, the template must be at least 8 pixels wide and
// its area must not exceed 256 pixels.
bool isMatchTemplateSupported(const Size2D &tmplSize)
{
    if (!isSupportedConfiguration())
        return false;

    // Actually the function could process even shorter templates
    // but there will be no NEON optimization in this case
    if (tmplSize.width < 8)
        return false;

    return (tmplSize.width * tmplSize.height) <= 256;
}
void matchTemplate(const Size2D &srcSize,
const u8 * srcBase, ptrdiff_t srcStride,
const Size2D &tmplSize,
const u8 * tmplBase, ptrdiff_t tmplStride,
f32 * dstBase, ptrdiff_t dstStride,
bool normalize)
{
internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
#ifdef CAROTENE_NEON
const size_t tmplW = tmplSize.width;
const size_t tmplH = tmplSize.height;
const size_t dstW = srcSize.width - tmplSize.width + 1;
const size_t dstH = srcSize.height - tmplSize.height + 1;
//template correlation part
{
#if ENABLE4LINESMATCHING
const size_t dstroiw4 = dstW & ~3u;
#endif
const size_t dstroiw2 = dstW & ~1u;
const size_t tmplroiw = tmplW & ~7u;
const size_t dstride = dstStride >> 2;
f32 *corr = dstBase;
const u8 *imgrrow = srcBase;
for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
{
size_t c = 0;
#if ENABLE4LINESMATCHING
for(; c < dstroiw4; c+=4)
{
u32 dot[4] = {0, 0, 0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
uint32x4_t vdot2 = vmovq_n_u32(0);
uint32x4_t vdot3 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
vdot2 = vpadalq_u16(vdot2, vd2);
vdot3 = vpadalq_u16(vdot3, vd3);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
dot[2] += tmpl[j] * img[j + c + 2];
dot[3] += tmpl[j] * img[j + c + 3];
}
}
uint32x4_t vdotx = vld1q_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
}
#endif
for(; c < dstroiw2; c+=2)
{
u32 dot[2] = {0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
}
}
uint32x2_t vdotx = vld1_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
}
for(; c < dstW; ++c)
{
u32 dot = 0;
uint32x4_t vdot = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg = vld1_u8(img + j + c);
uint16x8_t vd = vmull_u8(vtmpl, vimg);
vdot = vpadalq_u16(vdot, vd);
}
for(; j < tmplW; ++j)
dot += tmpl[j] * img[j + c];
}
u32 wdot[2];
vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
dot += wdot[0] + wdot[1];
corr[c] = (f32)dot;
}
}
}
if(normalize)
{
f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
size_t iw = srcSize.width+1;
size_t ih = srcSize.height+1;
std::vector<f64> _sqsum(iw*ih);
f64 *sqsum = &_sqsum[0];
memset(sqsum, 0, iw*sizeof(f64));
for(size_t i = 1; i < ih; ++i)
sqsum[iw*i] = 0.;
sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
for(size_t i = 0; i < dstH; ++i)
{
f32 *result = internal::getRowPtr(dstBase, dstStride, i);
for(size_t j = 0; j < dstW; ++j)
{
double s2 = sqsum[iw*i + j] +
sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
sqsum[iw*(i + tmplSize.height) + j] -
sqsum[iw*i + j + tmplSize.width];
result[j] /= tn * std::sqrt(s2);
}
}
}
#else
(void)srcSize;
(void)srcBase;
(void)srcStride;
(void)tmplBase;
(void)tmplStride;
(void)dstBase;
(void)dstStride;
(void)normalize;
#endif
}
} // namespace CAROTENE_NS

1627
3rdparty/carotene/src/threshold.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

689
3rdparty/carotene/src/vtransform.hpp vendored Normal file
View File

@ -0,0 +1,689 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VTRANSFORM_HPP
#define CAROTENE_SRC_VTRANSFORM_HPP
#include "common.hpp"
#include <carotene/types.hpp>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
////////////////////////////// Type Traits ///////////////////////
// VecTraits<T, cn> maps an element type T and a channel count cn (1..4) to the
// NEON vector types used to process it:
//   vec128 - the 128-bit vector type (cn > 1: the matching xN_t array-of-vectors struct)
//   vec64  - the 64-bit vector type  (cn > 1: the matching xN_t array-of-vectors struct)
//   unsign - the traits of the unsigned element type of the same width and the
//            SAME channel count (f32 maps to u32).
// Only the specializations listed below exist; the primary template is declared
// but intentionally left undefined.
template <typename T, int cn = 1>
struct VecTraits;
template <> struct VecTraits< u8, 1> { typedef uint8x16_t    vec128; typedef uint8x8_t     vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits< s8, 1> { typedef int8x16_t     vec128; typedef int8x8_t      vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits<u16, 1> { typedef uint16x8_t    vec128; typedef uint16x4_t    vec64; typedef VecTraits<u16, 1> unsign; };
template <> struct VecTraits<s16, 1> { typedef int16x8_t     vec128; typedef int16x4_t     vec64; typedef VecTraits<u16, 1> unsign; };
template <> struct VecTraits<s32, 1> { typedef int32x4_t     vec128; typedef int32x2_t     vec64; typedef VecTraits<u32, 1> unsign; };
template <> struct VecTraits<u32, 1> { typedef uint32x4_t    vec128; typedef uint32x2_t    vec64; typedef VecTraits<u32, 1> unsign; };
template <> struct VecTraits<s64, 1> { typedef int64x2_t     vec128; typedef int64x1_t     vec64; typedef VecTraits<u64, 1> unsign; };
template <> struct VecTraits<u64, 1> { typedef uint64x2_t    vec128; typedef uint64x1_t    vec64; typedef VecTraits<u64, 1> unsign; };
template <> struct VecTraits<f32, 1> { typedef float32x4_t   vec128; typedef float32x2_t   vec64; typedef VecTraits<u32, 1> unsign; };
template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t  vec128; typedef uint8x8x2_t   vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits< s8, 2> { typedef int8x16x2_t   vec128; typedef int8x8x2_t    vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t  vec128; typedef uint16x4x2_t  vec64; typedef VecTraits<u16, 2> unsign; };
template <> struct VecTraits<s16, 2> { typedef int16x8x2_t   vec128; typedef int16x4x2_t   vec64; typedef VecTraits<u16, 2> unsign; };
template <> struct VecTraits<s32, 2> { typedef int32x4x2_t   vec128; typedef int32x2x2_t   vec64; typedef VecTraits<u32, 2> unsign; };
template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t  vec128; typedef uint32x2x2_t  vec64; typedef VecTraits<u32, 2> unsign; };
template <> struct VecTraits<s64, 2> { typedef int64x2x2_t   vec128; typedef int64x1x2_t   vec64; typedef VecTraits<u64, 2> unsign; };
template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t  vec128; typedef uint64x1x2_t  vec64; typedef VecTraits<u64, 2> unsign; };
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits<u32, 2> unsign; };
template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t  vec128; typedef uint8x8x3_t   vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 3> { typedef int8x16x3_t   vec128; typedef int8x8x3_t    vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t  vec128; typedef uint16x4x3_t  vec64; typedef VecTraits<u16, 3> unsign; };
template <> struct VecTraits<s16, 3> { typedef int16x8x3_t   vec128; typedef int16x4x3_t   vec64; typedef VecTraits<u16, 3> unsign; };
template <> struct VecTraits<s32, 3> { typedef int32x4x3_t   vec128; typedef int32x2x3_t   vec64; typedef VecTraits<u32, 3> unsign; };
template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t  vec128; typedef uint32x2x3_t  vec64; typedef VecTraits<u32, 3> unsign; };
// 64-bit, 3 channels: unsign previously pointed at the 2-channel traits.
template <> struct VecTraits<s64, 3> { typedef int64x2x3_t   vec128; typedef int64x1x3_t   vec64; typedef VecTraits<u64, 3> unsign; };
template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t  vec128; typedef uint64x1x3_t  vec64; typedef VecTraits<u64, 3> unsign; };
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits<u32, 3> unsign; };
// 4 channels: unsign previously pointed at 3-channel (or, for 64-bit, 2-channel) traits.
template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t  vec128; typedef uint8x8x4_t   vec64; typedef VecTraits< u8, 4> unsign; };
template <> struct VecTraits< s8, 4> { typedef int8x16x4_t   vec128; typedef int8x8x4_t    vec64; typedef VecTraits< u8, 4> unsign; };
template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t  vec128; typedef uint16x4x4_t  vec64; typedef VecTraits<u16, 4> unsign; };
template <> struct VecTraits<s16, 4> { typedef int16x8x4_t   vec128; typedef int16x4x4_t   vec64; typedef VecTraits<u16, 4> unsign; };
template <> struct VecTraits<s32, 4> { typedef int32x4x4_t   vec128; typedef int32x2x4_t   vec64; typedef VecTraits<u32, 4> unsign; };
template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t  vec128; typedef uint32x2x4_t  vec64; typedef VecTraits<u32, 4> unsign; };
template <> struct VecTraits<s64, 4> { typedef int64x2x4_t   vec128; typedef int64x1x4_t   vec64; typedef VecTraits<u64, 4> unsign; };
template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t  vec128; typedef uint64x1x4_t  vec64; typedef VecTraits<u64, 4> unsign; };
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits<u32, 4> unsign; };
////////////////////////////// vld1q ///////////////////////
// vld1q overload set: picks the vld1q_<type> intrinsic from the pointer's
// element type and returns the resulting 128-bit vector.
inline uint8x16_t  vld1q(const u8  *ptr) { return vld1q_u8 (ptr); }
inline int8x16_t   vld1q(const s8  *ptr) { return vld1q_s8 (ptr); }
inline uint16x8_t  vld1q(const u16 *ptr) { return vld1q_u16(ptr); }
inline int16x8_t   vld1q(const s16 *ptr) { return vld1q_s16(ptr); }
inline uint32x4_t  vld1q(const u32 *ptr) { return vld1q_u32(ptr); }
inline int32x4_t   vld1q(const s32 *ptr) { return vld1q_s32(ptr); }
inline float32x4_t vld1q(const f32 *ptr) { return vld1q_f32(ptr); }
////////////////////////////// vld1 ///////////////////////
// vld1 overload set: picks the vld1_<type> intrinsic from the pointer's
// element type and returns the resulting 64-bit vector.
inline uint8x8_t   vld1(const u8  *ptr) { return vld1_u8 (ptr); }
inline int8x8_t    vld1(const s8  *ptr) { return vld1_s8 (ptr); }
inline uint16x4_t  vld1(const u16 *ptr) { return vld1_u16(ptr); }
inline int16x4_t   vld1(const s16 *ptr) { return vld1_s16(ptr); }
inline uint32x2_t  vld1(const u32 *ptr) { return vld1_u32(ptr); }
inline int32x2_t   vld1(const s32 *ptr) { return vld1_s32(ptr); }
inline float32x2_t vld1(const f32 *ptr) { return vld1_f32(ptr); }
////////////////////////////// vld2q ///////////////////////
// vld2q overload set: forwards to vld2q_<type>, chosen by the pointer's
// element type; the result is the two-vector (x2) 128-bit struct.
inline uint8x16x2_t  vld2q(const u8  *ptr) { return vld2q_u8 (ptr); }
inline int8x16x2_t   vld2q(const s8  *ptr) { return vld2q_s8 (ptr); }
inline uint16x8x2_t  vld2q(const u16 *ptr) { return vld2q_u16(ptr); }
inline int16x8x2_t   vld2q(const s16 *ptr) { return vld2q_s16(ptr); }
inline uint32x4x2_t  vld2q(const u32 *ptr) { return vld2q_u32(ptr); }
inline int32x4x2_t   vld2q(const s32 *ptr) { return vld2q_s32(ptr); }
inline float32x4x2_t vld2q(const f32 *ptr) { return vld2q_f32(ptr); }
////////////////////////////// vld2 ///////////////////////
// vld2 overload set: forwards to vld2_<type>, chosen by the pointer's
// element type; the result is the two-vector (x2) 64-bit struct.
inline uint8x8x2_t   vld2(const u8  *ptr) { return vld2_u8 (ptr); }
inline int8x8x2_t    vld2(const s8  *ptr) { return vld2_s8 (ptr); }
inline uint16x4x2_t  vld2(const u16 *ptr) { return vld2_u16(ptr); }
inline int16x4x2_t   vld2(const s16 *ptr) { return vld2_s16(ptr); }
inline uint32x2x2_t  vld2(const u32 *ptr) { return vld2_u32(ptr); }
inline int32x2x2_t   vld2(const s32 *ptr) { return vld2_s32(ptr); }
inline float32x2x2_t vld2(const f32 *ptr) { return vld2_f32(ptr); }
////////////////////////////// vld3q ///////////////////////
// vld3q overload set: forwards to vld3q_<type>, chosen by the pointer's
// element type; the result is the three-vector (x3) 128-bit struct.
inline uint8x16x3_t  vld3q(const u8  *ptr) { return vld3q_u8 (ptr); }
inline int8x16x3_t   vld3q(const s8  *ptr) { return vld3q_s8 (ptr); }
inline uint16x8x3_t  vld3q(const u16 *ptr) { return vld3q_u16(ptr); }
inline int16x8x3_t   vld3q(const s16 *ptr) { return vld3q_s16(ptr); }
inline uint32x4x3_t  vld3q(const u32 *ptr) { return vld3q_u32(ptr); }
inline int32x4x3_t   vld3q(const s32 *ptr) { return vld3q_s32(ptr); }
inline float32x4x3_t vld3q(const f32 *ptr) { return vld3q_f32(ptr); }
////////////////////////////// vld3 ///////////////////////
// vld3 overload set: forwards to vld3_<type>, chosen by the pointer's
// element type; the result is the three-vector (x3) 64-bit struct.
inline uint8x8x3_t   vld3(const u8  *ptr) { return vld3_u8 (ptr); }
inline int8x8x3_t    vld3(const s8  *ptr) { return vld3_s8 (ptr); }
inline uint16x4x3_t  vld3(const u16 *ptr) { return vld3_u16(ptr); }
inline int16x4x3_t   vld3(const s16 *ptr) { return vld3_s16(ptr); }
inline uint32x2x3_t  vld3(const u32 *ptr) { return vld3_u32(ptr); }
inline int32x2x3_t   vld3(const s32 *ptr) { return vld3_s32(ptr); }
inline float32x2x3_t vld3(const f32 *ptr) { return vld3_f32(ptr); }
////////////////////////////// vld4q ///////////////////////
// vld4q overload set: forwards to vld4q_<type>, chosen by the pointer's
// element type; the result is the four-vector (x4) 128-bit struct.
inline uint8x16x4_t  vld4q(const u8  *ptr) { return vld4q_u8 (ptr); }
inline int8x16x4_t   vld4q(const s8  *ptr) { return vld4q_s8 (ptr); }
inline uint16x8x4_t  vld4q(const u16 *ptr) { return vld4q_u16(ptr); }
inline int16x8x4_t   vld4q(const s16 *ptr) { return vld4q_s16(ptr); }
inline uint32x4x4_t  vld4q(const u32 *ptr) { return vld4q_u32(ptr); }
inline int32x4x4_t   vld4q(const s32 *ptr) { return vld4q_s32(ptr); }
inline float32x4x4_t vld4q(const f32 *ptr) { return vld4q_f32(ptr); }
////////////////////////////// vld4 ///////////////////////
// vld4 overload set: forwards to vld4_<type>, chosen by the pointer's
// element type; the result is the four-vector (x4) 64-bit struct.
inline uint8x8x4_t   vld4(const u8  *ptr) { return vld4_u8 (ptr); }
inline int8x8x4_t    vld4(const s8  *ptr) { return vld4_s8 (ptr); }
inline uint16x4x4_t  vld4(const u16 *ptr) { return vld4_u16(ptr); }
inline int16x4x4_t   vld4(const s16 *ptr) { return vld4_s16(ptr); }
inline uint32x2x4_t  vld4(const u32 *ptr) { return vld4_u32(ptr); }
inline int32x2x4_t   vld4(const s32 *ptr) { return vld4_s32(ptr); }
inline float32x2x4_t vld4(const f32 *ptr) { return vld4_f32(ptr); }
////////////////////////////// vst1q ///////////////////////
// vst1q overload set: hands the 128-bit vector v and destination ptr to the
// vst1q_<type> intrinsic matching the element type. Nothing is returned.
inline void vst1q(u8  *ptr, const uint8x16_t  &v) { vst1q_u8 (ptr, v); }
inline void vst1q(s8  *ptr, const int8x16_t   &v) { vst1q_s8 (ptr, v); }
inline void vst1q(u16 *ptr, const uint16x8_t  &v) { vst1q_u16(ptr, v); }
inline void vst1q(s16 *ptr, const int16x8_t   &v) { vst1q_s16(ptr, v); }
inline void vst1q(u32 *ptr, const uint32x4_t  &v) { vst1q_u32(ptr, v); }
inline void vst1q(s32 *ptr, const int32x4_t   &v) { vst1q_s32(ptr, v); }
inline void vst1q(f32 *ptr, const float32x4_t &v) { vst1q_f32(ptr, v); }
////////////////////////////// vst1 ///////////////////////
// vst1 overload set: hands the 64-bit vector v and destination ptr to the
// vst1_<type> intrinsic matching the element type. Nothing is returned.
inline void vst1(u8  *ptr, const uint8x8_t   &v) { vst1_u8 (ptr, v); }
inline void vst1(s8  *ptr, const int8x8_t    &v) { vst1_s8 (ptr, v); }
inline void vst1(u16 *ptr, const uint16x4_t  &v) { vst1_u16(ptr, v); }
inline void vst1(s16 *ptr, const int16x4_t   &v) { vst1_s16(ptr, v); }
inline void vst1(u32 *ptr, const uint32x2_t  &v) { vst1_u32(ptr, v); }
inline void vst1(s32 *ptr, const int32x2_t   &v) { vst1_s32(ptr, v); }
inline void vst1(f32 *ptr, const float32x2_t &v) { vst1_f32(ptr, v); }
////////////////////////////// vst2q ///////////////////////
// vst2q overload set: passes the two-vector (x2) 128-bit struct v and the
// destination ptr to the matching vst2q_<type> intrinsic. Nothing is returned.
inline void vst2q(u8  *ptr, const uint8x16x2_t  &v) { vst2q_u8 (ptr, v); }
inline void vst2q(s8  *ptr, const int8x16x2_t   &v) { vst2q_s8 (ptr, v); }
inline void vst2q(u16 *ptr, const uint16x8x2_t  &v) { vst2q_u16(ptr, v); }
inline void vst2q(s16 *ptr, const int16x8x2_t   &v) { vst2q_s16(ptr, v); }
inline void vst2q(u32 *ptr, const uint32x4x2_t  &v) { vst2q_u32(ptr, v); }
inline void vst2q(s32 *ptr, const int32x4x2_t   &v) { vst2q_s32(ptr, v); }
inline void vst2q(f32 *ptr, const float32x4x2_t &v) { vst2q_f32(ptr, v); }
////////////////////////////// vst2 ///////////////////////
// vst2 overload set: passes the two-vector (x2) 64-bit struct v and the
// destination ptr to the matching vst2_<type> intrinsic. Nothing is returned.
inline void vst2(u8  *ptr, const uint8x8x2_t   &v) { vst2_u8 (ptr, v); }
inline void vst2(s8  *ptr, const int8x8x2_t    &v) { vst2_s8 (ptr, v); }
inline void vst2(u16 *ptr, const uint16x4x2_t  &v) { vst2_u16(ptr, v); }
inline void vst2(s16 *ptr, const int16x4x2_t   &v) { vst2_s16(ptr, v); }
inline void vst2(u32 *ptr, const uint32x2x2_t  &v) { vst2_u32(ptr, v); }
inline void vst2(s32 *ptr, const int32x2x2_t   &v) { vst2_s32(ptr, v); }
inline void vst2(f32 *ptr, const float32x2x2_t &v) { vst2_f32(ptr, v); }
////////////////////////////// vst3q ///////////////////////
// vst3q overload set: passes the three-vector (x3) 128-bit struct v and the
// destination ptr to the matching vst3q_<type> intrinsic. Nothing is returned.
inline void vst3q(u8  *ptr, const uint8x16x3_t  &v) { vst3q_u8 (ptr, v); }
inline void vst3q(s8  *ptr, const int8x16x3_t   &v) { vst3q_s8 (ptr, v); }
inline void vst3q(u16 *ptr, const uint16x8x3_t  &v) { vst3q_u16(ptr, v); }
inline void vst3q(s16 *ptr, const int16x8x3_t   &v) { vst3q_s16(ptr, v); }
inline void vst3q(u32 *ptr, const uint32x4x3_t  &v) { vst3q_u32(ptr, v); }
inline void vst3q(s32 *ptr, const int32x4x3_t   &v) { vst3q_s32(ptr, v); }
inline void vst3q(f32 *ptr, const float32x4x3_t &v) { vst3q_f32(ptr, v); }
////////////////////////////// vst3 ///////////////////////
// vst3 overload set: passes the three-vector (x3) 64-bit struct v and the
// destination ptr to the matching vst3_<type> intrinsic. Nothing is returned.
inline void vst3(u8  *ptr, const uint8x8x3_t   &v) { vst3_u8 (ptr, v); }
inline void vst3(s8  *ptr, const int8x8x3_t    &v) { vst3_s8 (ptr, v); }
inline void vst3(u16 *ptr, const uint16x4x3_t  &v) { vst3_u16(ptr, v); }
inline void vst3(s16 *ptr, const int16x4x3_t   &v) { vst3_s16(ptr, v); }
inline void vst3(u32 *ptr, const uint32x2x3_t  &v) { vst3_u32(ptr, v); }
inline void vst3(s32 *ptr, const int32x2x3_t   &v) { vst3_s32(ptr, v); }
inline void vst3(f32 *ptr, const float32x2x3_t &v) { vst3_f32(ptr, v); }
////////////////////////////// vst4q ///////////////////////
// vst4q overload set: passes the four-vector (x4) 128-bit struct v and the
// destination ptr to the matching vst4q_<type> intrinsic. Nothing is returned.
inline void vst4q(u8  *ptr, const uint8x16x4_t  &v) { vst4q_u8 (ptr, v); }
inline void vst4q(s8  *ptr, const int8x16x4_t   &v) { vst4q_s8 (ptr, v); }
inline void vst4q(u16 *ptr, const uint16x8x4_t  &v) { vst4q_u16(ptr, v); }
inline void vst4q(s16 *ptr, const int16x8x4_t   &v) { vst4q_s16(ptr, v); }
inline void vst4q(u32 *ptr, const uint32x4x4_t  &v) { vst4q_u32(ptr, v); }
inline void vst4q(s32 *ptr, const int32x4x4_t   &v) { vst4q_s32(ptr, v); }
inline void vst4q(f32 *ptr, const float32x4x4_t &v) { vst4q_f32(ptr, v); }
////////////////////////////// vst4 ///////////////////////
// vst4 overload set: passes the four-vector (x4) 64-bit struct v and the
// destination ptr to the matching vst4_<type> intrinsic. Nothing is returned.
inline void vst4(u8  *ptr, const uint8x8x4_t   &v) { vst4_u8 (ptr, v); }
inline void vst4(s8  *ptr, const int8x8x4_t    &v) { vst4_s8 (ptr, v); }
inline void vst4(u16 *ptr, const uint16x4x4_t  &v) { vst4_u16(ptr, v); }
inline void vst4(s16 *ptr, const int16x4x4_t   &v) { vst4_s16(ptr, v); }
inline void vst4(u32 *ptr, const uint32x2x4_t  &v) { vst4_u32(ptr, v); }
inline void vst4(s32 *ptr, const int32x2x4_t   &v) { vst4_s32(ptr, v); }
inline void vst4(f32 *ptr, const float32x2x4_t &v) { vst4_f32(ptr, v); }
////////////////////////////// vabdq ///////////////////////
// vabdq overload set: dispatches on the 128-bit vector type of the two
// operands and returns the result of the corresponding vabdq_<type> intrinsic.
inline uint8x16_t  vabdq(const uint8x16_t  &v0, const uint8x16_t  &v1) { return vabdq_u8 (v0, v1); }
inline int8x16_t   vabdq(const int8x16_t   &v0, const int8x16_t   &v1) { return vabdq_s8 (v0, v1); }
inline uint16x8_t  vabdq(const uint16x8_t  &v0, const uint16x8_t  &v1) { return vabdq_u16(v0, v1); }
inline int16x8_t   vabdq(const int16x8_t   &v0, const int16x8_t   &v1) { return vabdq_s16(v0, v1); }
inline uint32x4_t  vabdq(const uint32x4_t  &v0, const uint32x4_t  &v1) { return vabdq_u32(v0, v1); }
inline int32x4_t   vabdq(const int32x4_t   &v0, const int32x4_t   &v1) { return vabdq_s32(v0, v1); }
inline float32x4_t vabdq(const float32x4_t &v0, const float32x4_t &v1) { return vabdq_f32(v0, v1); }
////////////////////////////// vabd ///////////////////////
// vabd overload set: dispatches on the 64-bit vector type of the two
// operands and returns the result of the corresponding vabd_<type> intrinsic.
inline uint8x8_t   vabd(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vabd_u8 (v0, v1); }
inline int8x8_t    vabd(const int8x8_t    &v0, const int8x8_t    &v1) { return vabd_s8 (v0, v1); }
inline uint16x4_t  vabd(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vabd_u16(v0, v1); }
inline int16x4_t   vabd(const int16x4_t   &v0, const int16x4_t   &v1) { return vabd_s16(v0, v1); }
inline uint32x2_t  vabd(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vabd_u32(v0, v1); }
inline int32x2_t   vabd(const int32x2_t   &v0, const int32x2_t   &v1) { return vabd_s32(v0, v1); }
inline float32x2_t vabd(const float32x2_t &v0, const float32x2_t &v1) { return vabd_f32(v0, v1); }
////////////////////////////// vminq ///////////////////////
// vminq overload set: dispatches on the 128-bit vector type of the two
// operands and returns the result of the corresponding vminq_<type> intrinsic.
inline uint8x16_t  vminq(const uint8x16_t  &v0, const uint8x16_t  &v1) { return vminq_u8 (v0, v1); }
inline int8x16_t   vminq(const int8x16_t   &v0, const int8x16_t   &v1) { return vminq_s8 (v0, v1); }
inline uint16x8_t  vminq(const uint16x8_t  &v0, const uint16x8_t  &v1) { return vminq_u16(v0, v1); }
inline int16x8_t   vminq(const int16x8_t   &v0, const int16x8_t   &v1) { return vminq_s16(v0, v1); }
inline uint32x4_t  vminq(const uint32x4_t  &v0, const uint32x4_t  &v1) { return vminq_u32(v0, v1); }
inline int32x4_t   vminq(const int32x4_t   &v0, const int32x4_t   &v1) { return vminq_s32(v0, v1); }
inline float32x4_t vminq(const float32x4_t &v0, const float32x4_t &v1) { return vminq_f32(v0, v1); }
////////////////////////////// vmin ///////////////////////
// vmin overload set: dispatches on the 64-bit vector type of the two
// operands and returns the result of the corresponding vmin_<type> intrinsic.
inline uint8x8_t   vmin(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vmin_u8 (v0, v1); }
inline int8x8_t    vmin(const int8x8_t    &v0, const int8x8_t    &v1) { return vmin_s8 (v0, v1); }
inline uint16x4_t  vmin(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vmin_u16(v0, v1); }
inline int16x4_t   vmin(const int16x4_t   &v0, const int16x4_t   &v1) { return vmin_s16(v0, v1); }
inline uint32x2_t  vmin(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vmin_u32(v0, v1); }
inline int32x2_t   vmin(const int32x2_t   &v0, const int32x2_t   &v1) { return vmin_s32(v0, v1); }
inline float32x2_t vmin(const float32x2_t &v0, const float32x2_t &v1) { return vmin_f32(v0, v1); }
////////////////////////////// vmaxq ///////////////////////
// vmaxq overload set: dispatches on the 128-bit vector type of the two
// operands and returns the result of the corresponding vmaxq_<type> intrinsic.
inline uint8x16_t  vmaxq(const uint8x16_t  &v0, const uint8x16_t  &v1) { return vmaxq_u8 (v0, v1); }
inline int8x16_t   vmaxq(const int8x16_t   &v0, const int8x16_t   &v1) { return vmaxq_s8 (v0, v1); }
inline uint16x8_t  vmaxq(const uint16x8_t  &v0, const uint16x8_t  &v1) { return vmaxq_u16(v0, v1); }
inline int16x8_t   vmaxq(const int16x8_t   &v0, const int16x8_t   &v1) { return vmaxq_s16(v0, v1); }
inline uint32x4_t  vmaxq(const uint32x4_t  &v0, const uint32x4_t  &v1) { return vmaxq_u32(v0, v1); }
inline int32x4_t   vmaxq(const int32x4_t   &v0, const int32x4_t   &v1) { return vmaxq_s32(v0, v1); }
inline float32x4_t vmaxq(const float32x4_t &v0, const float32x4_t &v1) { return vmaxq_f32(v0, v1); }
////////////////////////////// vmax ///////////////////////
// vmax overload set: dispatches on the 64-bit vector type of the two
// operands and returns the result of the corresponding vmax_<type> intrinsic.
inline uint8x8_t   vmax(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vmax_u8 (v0, v1); }
inline int8x8_t    vmax(const int8x8_t    &v0, const int8x8_t    &v1) { return vmax_s8 (v0, v1); }
inline uint16x4_t  vmax(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vmax_u16(v0, v1); }
inline int16x4_t   vmax(const int16x4_t   &v0, const int16x4_t   &v1) { return vmax_s16(v0, v1); }
inline uint32x2_t  vmax(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vmax_u32(v0, v1); }
inline int32x2_t   vmax(const int32x2_t   &v0, const int32x2_t   &v1) { return vmax_s32(v0, v1); }
inline float32x2_t vmax(const float32x2_t &v0, const float32x2_t &v1) { return vmax_f32(v0, v1); }
////////////////////////////// vdupq_n ///////////////////////
// vdupq_n overload set: selects vdupq_n_<type> from the scalar's type and
// returns the 128-bit vector it produces. Includes 64-bit integer scalars.
inline uint8x16_t  vdupq_n(const u8  &val) { return vdupq_n_u8 (val); }
inline int8x16_t   vdupq_n(const s8  &val) { return vdupq_n_s8 (val); }
inline uint16x8_t  vdupq_n(const u16 &val) { return vdupq_n_u16(val); }
inline int16x8_t   vdupq_n(const s16 &val) { return vdupq_n_s16(val); }
inline uint32x4_t  vdupq_n(const u32 &val) { return vdupq_n_u32(val); }
inline int32x4_t   vdupq_n(const s32 &val) { return vdupq_n_s32(val); }
inline uint64x2_t  vdupq_n(const u64 &val) { return vdupq_n_u64(val); }
inline int64x2_t   vdupq_n(const s64 &val) { return vdupq_n_s64(val); }
inline float32x4_t vdupq_n(const f32 &val) { return vdupq_n_f32(val); }
////////////////////////////// vdup_n ///////////////////////
// vdup_n overload set: selects vdup_n_<type> from the scalar's type and
// returns the 64-bit vector it produces. Includes 64-bit integer scalars.
inline uint8x8_t   vdup_n(const u8  &val) { return vdup_n_u8 (val); }
inline int8x8_t    vdup_n(const s8  &val) { return vdup_n_s8 (val); }
inline uint16x4_t  vdup_n(const u16 &val) { return vdup_n_u16(val); }
inline int16x4_t   vdup_n(const s16 &val) { return vdup_n_s16(val); }
inline uint32x2_t  vdup_n(const u32 &val) { return vdup_n_u32(val); }
inline int32x2_t   vdup_n(const s32 &val) { return vdup_n_s32(val); }
inline uint64x1_t  vdup_n(const u64 &val) { return vdup_n_u64(val); }
inline int64x1_t   vdup_n(const s64 &val) { return vdup_n_s64(val); }
inline float32x2_t vdup_n(const f32 &val) { return vdup_n_f32(val); }
////////////////////////////// vget_low ///////////////////////
// vget_low overload set: takes a 128-bit vector and returns the 64-bit vector
// produced by the vget_low_<type> intrinsic for that vector type.
inline uint8x8_t   vget_low(const uint8x16_t  &v) { return vget_low_u8 (v); }
inline int8x8_t    vget_low(const int8x16_t   &v) { return vget_low_s8 (v); }
inline uint16x4_t  vget_low(const uint16x8_t  &v) { return vget_low_u16(v); }
inline int16x4_t   vget_low(const int16x8_t   &v) { return vget_low_s16(v); }
inline uint32x2_t  vget_low(const uint32x4_t  &v) { return vget_low_u32(v); }
inline int32x2_t   vget_low(const int32x4_t   &v) { return vget_low_s32(v); }
inline float32x2_t vget_low(const float32x4_t &v) { return vget_low_f32(v); }
////////////////////////////// vget_high ///////////////////////
// vget_high overload set: takes a 128-bit vector and returns the 64-bit vector
// produced by the vget_high_<type> intrinsic for that vector type.
inline uint8x8_t   vget_high(const uint8x16_t  &v) { return vget_high_u8 (v); }
inline int8x8_t    vget_high(const int8x16_t   &v) { return vget_high_s8 (v); }
inline uint16x4_t  vget_high(const uint16x8_t  &v) { return vget_high_u16(v); }
inline int16x4_t   vget_high(const int16x8_t   &v) { return vget_high_s16(v); }
inline uint32x2_t  vget_high(const uint32x4_t  &v) { return vget_high_u32(v); }
inline int32x2_t   vget_high(const int32x4_t   &v) { return vget_high_s32(v); }
inline float32x2_t vget_high(const float32x4_t &v) { return vget_high_f32(v); }
////////////////////////////// vcombine ///////////////////////
// vcombine overload set: takes two 64-bit vectors of the same type and returns
// the 128-bit vector produced by the matching vcombine_<type> intrinsic
// (v0 is passed as the first argument, v1 as the second).
inline uint8x16_t  vcombine(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vcombine_u8 (v0, v1); }
inline int8x16_t   vcombine(const int8x8_t    &v0, const int8x8_t    &v1) { return vcombine_s8 (v0, v1); }
inline uint16x8_t  vcombine(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vcombine_u16(v0, v1); }
inline int16x8_t   vcombine(const int16x4_t   &v0, const int16x4_t   &v1) { return vcombine_s16(v0, v1); }
inline uint32x4_t  vcombine(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vcombine_u32(v0, v1); }
inline int32x4_t   vcombine(const int32x2_t   &v0, const int32x2_t   &v1) { return vcombine_s32(v0, v1); }
inline float32x4_t vcombine(const float32x2_t &v0, const float32x2_t &v1) { return vcombine_f32(v0, v1); }
////////////////////////////// vaddq ///////////////////////
// vaddq overload set: dispatches on the 128-bit vector type of the two
// operands and returns the result of the corresponding vaddq_<type> intrinsic.
inline uint8x16_t  vaddq(const uint8x16_t  &v0, const uint8x16_t  &v1) { return vaddq_u8 (v0, v1); }
inline int8x16_t   vaddq(const int8x16_t   &v0, const int8x16_t   &v1) { return vaddq_s8 (v0, v1); }
inline uint16x8_t  vaddq(const uint16x8_t  &v0, const uint16x8_t  &v1) { return vaddq_u16(v0, v1); }
inline int16x8_t   vaddq(const int16x8_t   &v0, const int16x8_t   &v1) { return vaddq_s16(v0, v1); }
inline uint32x4_t  vaddq(const uint32x4_t  &v0, const uint32x4_t  &v1) { return vaddq_u32(v0, v1); }
inline int32x4_t   vaddq(const int32x4_t   &v0, const int32x4_t   &v1) { return vaddq_s32(v0, v1); }
inline float32x4_t vaddq(const float32x4_t &v0, const float32x4_t &v1) { return vaddq_f32(v0, v1); }
////////////////////////////// vadd ///////////////////////
// vadd overload set: dispatches on the 64-bit vector type of the two
// operands and returns the result of the corresponding vadd_<type> intrinsic.
inline uint8x8_t   vadd(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vadd_u8 (v0, v1); }
inline int8x8_t    vadd(const int8x8_t    &v0, const int8x8_t    &v1) { return vadd_s8 (v0, v1); }
inline uint16x4_t  vadd(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vadd_u16(v0, v1); }
inline int16x4_t   vadd(const int16x4_t   &v0, const int16x4_t   &v1) { return vadd_s16(v0, v1); }
inline uint32x2_t  vadd(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vadd_u32(v0, v1); }
inline int32x2_t   vadd(const int32x2_t   &v0, const int32x2_t   &v1) { return vadd_s32(v0, v1); }
inline float32x2_t vadd(const float32x2_t &v0, const float32x2_t &v1) { return vadd_f32(v0, v1); }
////////////////////////////// vqaddq ///////////////////////
// vqaddq overload set: dispatches on the 128-bit integer vector type of the
// two operands and returns the result of the matching vqaddq_<type> intrinsic.
// NOTE(review): only 8/16/32-bit integer overloads are defined here, while the
// vqsubq set in this file also covers u64/s64 - confirm whether that is intended.
inline uint8x16_t vqaddq(const uint8x16_t &v0, const uint8x16_t &v1) { return vqaddq_u8 (v0, v1); }
inline int8x16_t  vqaddq(const int8x16_t  &v0, const int8x16_t  &v1) { return vqaddq_s8 (v0, v1); }
inline uint16x8_t vqaddq(const uint16x8_t &v0, const uint16x8_t &v1) { return vqaddq_u16(v0, v1); }
inline int16x8_t  vqaddq(const int16x8_t  &v0, const int16x8_t  &v1) { return vqaddq_s16(v0, v1); }
inline uint32x4_t vqaddq(const uint32x4_t &v0, const uint32x4_t &v1) { return vqaddq_u32(v0, v1); }
inline int32x4_t  vqaddq(const int32x4_t  &v0, const int32x4_t  &v1) { return vqaddq_s32(v0, v1); }
////////////////////////////// vqadd ///////////////////////
// vqadd overload set: dispatches on the 64-bit integer vector type of the
// two operands and returns the result of the matching vqadd_<type> intrinsic.
// NOTE(review): only 8/16/32-bit integer overloads are defined here, while the
// vqsub set in this file also covers u64/s64 - confirm whether that is intended.
inline uint8x8_t  vqadd(const uint8x8_t  &v0, const uint8x8_t  &v1) { return vqadd_u8 (v0, v1); }
inline int8x8_t   vqadd(const int8x8_t   &v0, const int8x8_t   &v1) { return vqadd_s8 (v0, v1); }
inline uint16x4_t vqadd(const uint16x4_t &v0, const uint16x4_t &v1) { return vqadd_u16(v0, v1); }
inline int16x4_t  vqadd(const int16x4_t  &v0, const int16x4_t  &v1) { return vqadd_s16(v0, v1); }
inline uint32x2_t vqadd(const uint32x2_t &v0, const uint32x2_t &v1) { return vqadd_u32(v0, v1); }
inline int32x2_t  vqadd(const int32x2_t  &v0, const int32x2_t  &v1) { return vqadd_s32(v0, v1); }
////////////////////////////// vsubq ///////////////////////
// vsubq overload set: dispatches on the 128-bit vector type of the two
// operands and returns the result of the corresponding vsubq_<type> intrinsic
// (v0 is passed as the first argument, v1 as the second).
inline uint8x16_t  vsubq(const uint8x16_t  &v0, const uint8x16_t  &v1) { return vsubq_u8 (v0, v1); }
inline int8x16_t   vsubq(const int8x16_t   &v0, const int8x16_t   &v1) { return vsubq_s8 (v0, v1); }
inline uint16x8_t  vsubq(const uint16x8_t  &v0, const uint16x8_t  &v1) { return vsubq_u16(v0, v1); }
inline int16x8_t   vsubq(const int16x8_t   &v0, const int16x8_t   &v1) { return vsubq_s16(v0, v1); }
inline uint32x4_t  vsubq(const uint32x4_t  &v0, const uint32x4_t  &v1) { return vsubq_u32(v0, v1); }
inline int32x4_t   vsubq(const int32x4_t   &v0, const int32x4_t   &v1) { return vsubq_s32(v0, v1); }
inline float32x4_t vsubq(const float32x4_t &v0, const float32x4_t &v1) { return vsubq_f32(v0, v1); }
////////////////////////////// vsub ///////////////////////
// vsub overload set: dispatches on the 64-bit vector type of the two
// operands and returns the result of the corresponding vsub_<type> intrinsic
// (v0 is passed as the first argument, v1 as the second).
inline uint8x8_t   vsub(const uint8x8_t   &v0, const uint8x8_t   &v1) { return vsub_u8 (v0, v1); }
inline int8x8_t    vsub(const int8x8_t    &v0, const int8x8_t    &v1) { return vsub_s8 (v0, v1); }
inline uint16x4_t  vsub(const uint16x4_t  &v0, const uint16x4_t  &v1) { return vsub_u16(v0, v1); }
inline int16x4_t   vsub(const int16x4_t   &v0, const int16x4_t   &v1) { return vsub_s16(v0, v1); }
inline uint32x2_t  vsub(const uint32x2_t  &v0, const uint32x2_t  &v1) { return vsub_u32(v0, v1); }
inline int32x2_t   vsub(const int32x2_t   &v0, const int32x2_t   &v1) { return vsub_s32(v0, v1); }
inline float32x2_t vsub(const float32x2_t &v0, const float32x2_t &v1) { return vsub_f32(v0, v1); }
////////////////////////////// vqsubq ///////////////////////
// vqsubq overload set: dispatches on the 128-bit integer vector type of the
// two operands (8- to 64-bit lanes) and returns the result of the matching
// vqsubq_<type> intrinsic (v0 is the first argument, v1 the second).
inline uint8x16_t vqsubq(const uint8x16_t &v0, const uint8x16_t &v1) { return vqsubq_u8 (v0, v1); }
inline int8x16_t  vqsubq(const int8x16_t  &v0, const int8x16_t  &v1) { return vqsubq_s8 (v0, v1); }
inline uint16x8_t vqsubq(const uint16x8_t &v0, const uint16x8_t &v1) { return vqsubq_u16(v0, v1); }
inline int16x8_t  vqsubq(const int16x8_t  &v0, const int16x8_t  &v1) { return vqsubq_s16(v0, v1); }
inline uint32x4_t vqsubq(const uint32x4_t &v0, const uint32x4_t &v1) { return vqsubq_u32(v0, v1); }
inline int32x4_t  vqsubq(const int32x4_t  &v0, const int32x4_t  &v1) { return vqsubq_s32(v0, v1); }
inline uint64x2_t vqsubq(const uint64x2_t &v0, const uint64x2_t &v1) { return vqsubq_u64(v0, v1); }
inline int64x2_t  vqsubq(const int64x2_t  &v0, const int64x2_t  &v1) { return vqsubq_s64(v0, v1); }
////////////////////////////// vqsub ///////////////////////
// Overload set for the vqsub_<suffix> intrinsics on 64-bit vectors (8- to 64-bit integer lanes).
inline uint8x8_t vqsub(const uint8x8_t &a, const uint8x8_t &b) { return vqsub_u8(a, b); }
inline int8x8_t vqsub(const int8x8_t &a, const int8x8_t &b) { return vqsub_s8(a, b); }
inline uint16x4_t vqsub(const uint16x4_t &a, const uint16x4_t &b) { return vqsub_u16(a, b); }
inline int16x4_t vqsub(const int16x4_t &a, const int16x4_t &b) { return vqsub_s16(a, b); }
inline uint32x2_t vqsub(const uint32x2_t &a, const uint32x2_t &b) { return vqsub_u32(a, b); }
inline int32x2_t vqsub(const int32x2_t &a, const int32x2_t &b) { return vqsub_s32(a, b); }
inline uint64x1_t vqsub(const uint64x1_t &a, const uint64x1_t &b) { return vqsub_u64(a, b); }
inline int64x1_t vqsub(const int64x1_t &a, const int64x1_t &b) { return vqsub_s64(a, b); }
////////////////////////////// vmull ///////////////////////
// Overload set for the vmull_<suffix> intrinsics. Operands are 64-bit vectors; the returned
// 128-bit vector has lanes twice as wide as the operand lanes (e.g. uint8x8_t -> uint16x8_t).
inline uint16x8_t vmull(const uint8x8_t &a, const uint8x8_t &b) { return vmull_u8(a, b); }
inline int16x8_t vmull(const int8x8_t &a, const int8x8_t &b) { return vmull_s8(a, b); }
inline uint32x4_t vmull(const uint16x4_t &a, const uint16x4_t &b) { return vmull_u16(a, b); }
inline int32x4_t vmull(const int16x4_t &a, const int16x4_t &b) { return vmull_s16(a, b); }
inline uint64x2_t vmull(const uint32x2_t &a, const uint32x2_t &b) { return vmull_u32(a, b); }
inline int64x2_t vmull(const int32x2_t &a, const int32x2_t &b) { return vmull_s32(a, b); }
////////////////////////////// vrev64q ///////////////////////
// Overload set for the vrev64q_<suffix> intrinsics (8/16/32-bit integer lanes and f32).
inline uint8x16_t vrev64q(const uint8x16_t &src) { return vrev64q_u8(src); }
inline int8x16_t vrev64q(const int8x16_t &src) { return vrev64q_s8(src); }
inline uint16x8_t vrev64q(const uint16x8_t &src) { return vrev64q_u16(src); }
inline int16x8_t vrev64q(const int16x8_t &src) { return vrev64q_s16(src); }
inline uint32x4_t vrev64q(const uint32x4_t &src) { return vrev64q_u32(src); }
inline int32x4_t vrev64q(const int32x4_t &src) { return vrev64q_s32(src); }
inline float32x4_t vrev64q(const float32x4_t &src) { return vrev64q_f32(src); }
////////////////////////////// vrev64 ///////////////////////
// Overload set for the vrev64_<suffix> intrinsics on 64-bit vectors (8/16/32-bit integer lanes and f32).
inline uint8x8_t vrev64(const uint8x8_t &src) { return vrev64_u8(src); }
inline int8x8_t vrev64(const int8x8_t &src) { return vrev64_s8(src); }
inline uint16x4_t vrev64(const uint16x4_t &src) { return vrev64_u16(src); }
inline int16x4_t vrev64(const int16x4_t &src) { return vrev64_s16(src); }
inline uint32x2_t vrev64(const uint32x2_t &src) { return vrev64_u32(src); }
inline int32x2_t vrev64(const int32x2_t &src) { return vrev64_s32(src); }
inline float32x2_t vrev64(const float32x2_t &src) { return vrev64_f32(src); }
////////////////////////////// vceqq ///////////////////////
// Overload set for the vceqq_<suffix> comparison intrinsics. Whatever the operand type
// (unsigned, signed or f32), the result is the unsigned vector with the same lane layout.
inline uint8x16_t vceqq(const uint8x16_t &a, const uint8x16_t &b) { return vceqq_u8(a, b); }
inline uint8x16_t vceqq(const int8x16_t &a, const int8x16_t &b) { return vceqq_s8(a, b); }
inline uint16x8_t vceqq(const uint16x8_t &a, const uint16x8_t &b) { return vceqq_u16(a, b); }
inline uint16x8_t vceqq(const int16x8_t &a, const int16x8_t &b) { return vceqq_s16(a, b); }
inline uint32x4_t vceqq(const uint32x4_t &a, const uint32x4_t &b) { return vceqq_u32(a, b); }
inline uint32x4_t vceqq(const int32x4_t &a, const int32x4_t &b) { return vceqq_s32(a, b); }
inline uint32x4_t vceqq(const float32x4_t &a, const float32x4_t &b) { return vceqq_f32(a, b); }
////////////////////////////// vceq ///////////////////////
// Overload set for the vceq_<suffix> comparison intrinsics on 64-bit vectors. The result is
// always the unsigned vector with the same lane layout as the operands.
inline uint8x8_t vceq(const uint8x8_t &a, const uint8x8_t &b) { return vceq_u8(a, b); }
inline uint8x8_t vceq(const int8x8_t &a, const int8x8_t &b) { return vceq_s8(a, b); }
inline uint16x4_t vceq(const uint16x4_t &a, const uint16x4_t &b) { return vceq_u16(a, b); }
inline uint16x4_t vceq(const int16x4_t &a, const int16x4_t &b) { return vceq_s16(a, b); }
inline uint32x2_t vceq(const uint32x2_t &a, const uint32x2_t &b) { return vceq_u32(a, b); }
inline uint32x2_t vceq(const int32x2_t &a, const int32x2_t &b) { return vceq_s32(a, b); }
inline uint32x2_t vceq(const float32x2_t &a, const float32x2_t &b) { return vceq_f32(a, b); }
////////////////////////////// vcgtq ///////////////////////
// Overload set for the vcgtq_<suffix> comparison intrinsics. The result is always the
// unsigned vector with the same lane layout as the operands.
inline uint8x16_t vcgtq(const uint8x16_t &a, const uint8x16_t &b) { return vcgtq_u8(a, b); }
inline uint8x16_t vcgtq(const int8x16_t &a, const int8x16_t &b) { return vcgtq_s8(a, b); }
inline uint16x8_t vcgtq(const uint16x8_t &a, const uint16x8_t &b) { return vcgtq_u16(a, b); }
inline uint16x8_t vcgtq(const int16x8_t &a, const int16x8_t &b) { return vcgtq_s16(a, b); }
inline uint32x4_t vcgtq(const uint32x4_t &a, const uint32x4_t &b) { return vcgtq_u32(a, b); }
inline uint32x4_t vcgtq(const int32x4_t &a, const int32x4_t &b) { return vcgtq_s32(a, b); }
inline uint32x4_t vcgtq(const float32x4_t &a, const float32x4_t &b) { return vcgtq_f32(a, b); }
////////////////////////////// vcgt ///////////////////////
// Overload set for the vcgt_<suffix> comparison intrinsics on 64-bit vectors. The result is
// always the unsigned vector with the same lane layout as the operands.
inline uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b) { return vcgt_u8(a, b); }
inline uint8x8_t vcgt(const int8x8_t &a, const int8x8_t &b) { return vcgt_s8(a, b); }
inline uint16x4_t vcgt(const uint16x4_t &a, const uint16x4_t &b) { return vcgt_u16(a, b); }
inline uint16x4_t vcgt(const int16x4_t &a, const int16x4_t &b) { return vcgt_s16(a, b); }
inline uint32x2_t vcgt(const uint32x2_t &a, const uint32x2_t &b) { return vcgt_u32(a, b); }
inline uint32x2_t vcgt(const int32x2_t &a, const int32x2_t &b) { return vcgt_s32(a, b); }
inline uint32x2_t vcgt(const float32x2_t &a, const float32x2_t &b) { return vcgt_f32(a, b); }
////////////////////////////// vcgeq ///////////////////////
// Overload set for the vcgeq_<suffix> comparison intrinsics. The result is always the
// unsigned vector with the same lane layout as the operands.
inline uint8x16_t vcgeq(const uint8x16_t &a, const uint8x16_t &b) { return vcgeq_u8(a, b); }
inline uint8x16_t vcgeq(const int8x16_t &a, const int8x16_t &b) { return vcgeq_s8(a, b); }
inline uint16x8_t vcgeq(const uint16x8_t &a, const uint16x8_t &b) { return vcgeq_u16(a, b); }
inline uint16x8_t vcgeq(const int16x8_t &a, const int16x8_t &b) { return vcgeq_s16(a, b); }
inline uint32x4_t vcgeq(const uint32x4_t &a, const uint32x4_t &b) { return vcgeq_u32(a, b); }
inline uint32x4_t vcgeq(const int32x4_t &a, const int32x4_t &b) { return vcgeq_s32(a, b); }
inline uint32x4_t vcgeq(const float32x4_t &a, const float32x4_t &b) { return vcgeq_f32(a, b); }
////////////////////////////// vcge ///////////////////////
// Overload set for the vcge_<suffix> comparison intrinsics on 64-bit vectors. The result is
// always the unsigned vector with the same lane layout as the operands.
inline uint8x8_t vcge(const uint8x8_t &a, const uint8x8_t &b) { return vcge_u8(a, b); }
inline uint8x8_t vcge(const int8x8_t &a, const int8x8_t &b) { return vcge_s8(a, b); }
inline uint16x4_t vcge(const uint16x4_t &a, const uint16x4_t &b) { return vcge_u16(a, b); }
inline uint16x4_t vcge(const int16x4_t &a, const int16x4_t &b) { return vcge_s16(a, b); }
inline uint32x2_t vcge(const uint32x2_t &a, const uint32x2_t &b) { return vcge_u32(a, b); }
inline uint32x2_t vcge(const int32x2_t &a, const int32x2_t &b) { return vcge_s32(a, b); }
inline uint32x2_t vcge(const float32x2_t &a, const float32x2_t &b) { return vcge_f32(a, b); }
////////////////////////////// vandq ///////////////////////
// Overload set for the vandq_<suffix> intrinsics. Only 8/16/32-bit integer lanes are covered here.
inline uint8x16_t vandq(const uint8x16_t &a, const uint8x16_t &b) { return vandq_u8(a, b); }
inline int8x16_t vandq(const int8x16_t &a, const int8x16_t &b) { return vandq_s8(a, b); }
inline uint16x8_t vandq(const uint16x8_t &a, const uint16x8_t &b) { return vandq_u16(a, b); }
inline int16x8_t vandq(const int16x8_t &a, const int16x8_t &b) { return vandq_s16(a, b); }
inline uint32x4_t vandq(const uint32x4_t &a, const uint32x4_t &b) { return vandq_u32(a, b); }
inline int32x4_t vandq(const int32x4_t &a, const int32x4_t &b) { return vandq_s32(a, b); }
////////////////////////////// vand ///////////////////////
// Overload set for the vand_<suffix> intrinsics on 64-bit vectors. Only 8/16/32-bit integer lanes are covered here.
inline uint8x8_t vand(const uint8x8_t &a, const uint8x8_t &b) { return vand_u8(a, b); }
inline int8x8_t vand(const int8x8_t &a, const int8x8_t &b) { return vand_s8(a, b); }
inline uint16x4_t vand(const uint16x4_t &a, const uint16x4_t &b) { return vand_u16(a, b); }
inline int16x4_t vand(const int16x4_t &a, const int16x4_t &b) { return vand_s16(a, b); }
inline uint32x2_t vand(const uint32x2_t &a, const uint32x2_t &b) { return vand_u32(a, b); }
inline int32x2_t vand(const int32x2_t &a, const int32x2_t &b) { return vand_s32(a, b); }
////////////////////////////// vmovn ///////////////////////
// Overload set for the vmovn_<suffix> intrinsics. The operand is a 128-bit vector; the returned
// 64-bit vector has lanes half as wide (e.g. uint16x8_t -> uint8x8_t).
inline uint8x8_t vmovn(const uint16x8_t &wide) { return vmovn_u16(wide); }
inline int8x8_t vmovn(const int16x8_t &wide) { return vmovn_s16(wide); }
inline uint16x4_t vmovn(const uint32x4_t &wide) { return vmovn_u32(wide); }
inline int16x4_t vmovn(const int32x4_t &wide) { return vmovn_s32(wide); }
inline uint32x2_t vmovn(const uint64x2_t &wide) { return vmovn_u64(wide); }
inline int32x2_t vmovn(const int64x2_t &wide) { return vmovn_s64(wide); }
////////////////////////////// vqmovn ///////////////////////
// Overload set for the vqmovn_<suffix> intrinsics. The operand is a 128-bit vector; the returned
// 64-bit vector has lanes half as wide (e.g. uint16x8_t -> uint8x8_t).
inline uint8x8_t vqmovn(const uint16x8_t &wide) { return vqmovn_u16(wide); }
inline int8x8_t vqmovn(const int16x8_t &wide) { return vqmovn_s16(wide); }
inline uint16x4_t vqmovn(const uint32x4_t &wide) { return vqmovn_u32(wide); }
inline int16x4_t vqmovn(const int32x4_t &wide) { return vqmovn_s32(wide); }
inline uint32x2_t vqmovn(const uint64x2_t &wide) { return vqmovn_u64(wide); }
inline int32x2_t vqmovn(const int64x2_t &wide) { return vqmovn_s64(wide); }
////////////////////////////// vmovl ///////////////////////
// Overload set for the vmovl_<suffix> intrinsics. The operand is a 64-bit vector; the returned
// 128-bit vector has lanes twice as wide. Only 8- and 16-bit source lanes are covered here.
inline uint16x8_t vmovl(const uint8x8_t &narrow) { return vmovl_u8(narrow); }
inline int16x8_t vmovl(const int8x8_t &narrow) { return vmovl_s8(narrow); }
inline uint32x4_t vmovl(const uint16x4_t &narrow) { return vmovl_u16(narrow); }
inline int32x4_t vmovl(const int16x4_t &narrow) { return vmovl_s16(narrow); }
////////////////////////////// vmvnq ///////////////////////
// Overload set for the vmvnq_<suffix> intrinsics (8/16/32-bit integer lanes).
inline uint8x16_t vmvnq(const uint8x16_t &src) { return vmvnq_u8(src); }
inline int8x16_t vmvnq(const int8x16_t &src) { return vmvnq_s8(src); }
inline uint16x8_t vmvnq(const uint16x8_t &src) { return vmvnq_u16(src); }
inline int16x8_t vmvnq(const int16x8_t &src) { return vmvnq_s16(src); }
inline uint32x4_t vmvnq(const uint32x4_t &src) { return vmvnq_u32(src); }
inline int32x4_t vmvnq(const int32x4_t &src) { return vmvnq_s32(src); }
////////////////////////////// vmvn ///////////////////////
// Overload set for the vmvn_<suffix> intrinsics on 64-bit vectors (8/16/32-bit integer lanes).
inline uint8x8_t vmvn(const uint8x8_t &src) { return vmvn_u8(src); }
inline int8x8_t vmvn(const int8x8_t &src) { return vmvn_s8(src); }
inline uint16x4_t vmvn(const uint16x4_t &src) { return vmvn_u16(src); }
inline int16x4_t vmvn(const int16x4_t &src) { return vmvn_s16(src); }
inline uint32x2_t vmvn(const uint32x2_t &src) { return vmvn_u32(src); }
inline int32x2_t vmvn(const int32x2_t &src) { return vmvn_s32(src); }
////////////////////////////// vbicq ///////////////////////
// Overload set for the vbicq_<suffix> intrinsics (8- to 64-bit integer lanes). Arguments are
// forwarded in the order given.
inline uint8x16_t vbicq(const uint8x16_t &a, const uint8x16_t &b) { return vbicq_u8(a, b); }
inline int8x16_t vbicq(const int8x16_t &a, const int8x16_t &b) { return vbicq_s8(a, b); }
inline uint16x8_t vbicq(const uint16x8_t &a, const uint16x8_t &b) { return vbicq_u16(a, b); }
inline int16x8_t vbicq(const int16x8_t &a, const int16x8_t &b) { return vbicq_s16(a, b); }
inline uint32x4_t vbicq(const uint32x4_t &a, const uint32x4_t &b) { return vbicq_u32(a, b); }
inline int32x4_t vbicq(const int32x4_t &a, const int32x4_t &b) { return vbicq_s32(a, b); }
inline uint64x2_t vbicq(const uint64x2_t &a, const uint64x2_t &b) { return vbicq_u64(a, b); }
inline int64x2_t vbicq(const int64x2_t &a, const int64x2_t &b) { return vbicq_s64(a, b); }
////////////////////////////// vbic ///////////////////////
// Overload set for the vbic_<suffix> intrinsics on 64-bit vectors (8- to 64-bit integer lanes).
// Arguments are forwarded in the order given.
inline uint8x8_t vbic(const uint8x8_t &a, const uint8x8_t &b) { return vbic_u8(a, b); }
inline int8x8_t vbic(const int8x8_t &a, const int8x8_t &b) { return vbic_s8(a, b); }
inline uint16x4_t vbic(const uint16x4_t &a, const uint16x4_t &b) { return vbic_u16(a, b); }
inline int16x4_t vbic(const int16x4_t &a, const int16x4_t &b) { return vbic_s16(a, b); }
inline uint32x2_t vbic(const uint32x2_t &a, const uint32x2_t &b) { return vbic_u32(a, b); }
inline int32x2_t vbic(const int32x2_t &a, const int32x2_t &b) { return vbic_s32(a, b); }
inline uint64x1_t vbic(const uint64x1_t &a, const uint64x1_t &b) { return vbic_u64(a, b); }
inline int64x1_t vbic(const int64x1_t &a, const int64x1_t &b) { return vbic_s64(a, b); }
////////////////////////////// vtransform ///////////////////////
// Applies the binary functor `op` to two images, element by element, writing into a third.
//
// Op must expose a nested `type` (the element type of all three images) and be callable as
//   op(vec128, vec128, vec128)   - two 128-bit inputs, third argument receives the result
//   op(vec64,  vec64,  vec64)    - same for 64-bit vectors
//   op(const type *, const type *, type *) - single element, addressed through pointers
// where vec128 / vec64 come from VecTraits<type>.
//
// size                 - image size in elements (taken by value; may be flattened below)
// src0Base, src0Stride - first input and its row stride as consumed by internal::getRowPtr
// src1Base, src1Stride - second input, same conventions
// dstBase, dstStride   - output, same conventions
// op                   - the functor described above
//
// Each row is processed in three stages: 32 bytes per iteration (two 128-bit vectors),
// then 8 bytes per iteration (one 64-bit vector), then one element at a time.
template <typename Op>
void vtransform(Size2D size,
                const typename Op::type * src0Base, ptrdiff_t src0Stride,
                const typename Op::type * src1Base, ptrdiff_t src1Stride,
                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type elem_t;
    typedef typename VecTraits<elem_t>::vec128 v128_t;
    typedef typename VecTraits<elem_t>::vec64 v64_t;

    // When all three images share one stride and that stride equals the row length in bytes,
    // the rows are back to back, so the whole image can be walked as one long row.
    const ptrdiff_t packedStride = (ptrdiff_t)(size.width * sizeof(elem_t));
    if (src0Stride == src1Stride && src0Stride == dstStride && src0Stride == packedStride)
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Element counts covered by 8, 16 and 32 bytes respectively.
    const size_t lanes64 = 8 / sizeof(elem_t);
    const size_t lanes128 = 16 / sizeof(elem_t);
    const size_t lanes256 = 32 / sizeof(elem_t);

    // Exclusive upper bounds on the start index of a full 32-byte / 8-byte step
    // (0 when the row is too short for even one such step).
    const size_t wideLimit = size.width >= (lanes256 - 1) ? size.width - lanes256 + 1 : 0;
    const size_t narrowLimit = size.width >= (lanes64 - 1) ? size.width - lanes64 + 1 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const elem_t * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const elem_t * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        elem_t * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // Stage 1: two 128-bit vectors per iteration. All four loads are issued before the
        // first store, exactly as in the straight-line form of this loop.
        while (col < wideLimit)
        {
            internal::prefetch(lhs + col);
            internal::prefetch(rhs + col);
            v128_t lhsLo = vld1q(lhs + col);
            v128_t lhsHi = vld1q(lhs + col + lanes128);
            v128_t rhsLo = vld1q(rhs + col);
            v128_t rhsHi = vld1q(rhs + col + lanes128);
            v128_t result;
            op(lhsLo, rhsLo, result);
            vst1q(out + col, result);
            op(lhsHi, rhsHi, result);
            vst1q(out + col + lanes128, result);
            col += lanes256;
        }

        // Stage 2: one 64-bit vector per iteration.
        while (col < narrowLimit)
        {
            v64_t lhsVec = vld1(lhs + col);
            v64_t rhsVec = vld1(rhs + col);
            v64_t result;
            op(lhsVec, rhsVec, result);
            vst1(out + col, result);
            col += lanes64;
        }

        // Stage 3: leftover elements through the pointer overload of the functor.
        while (col < size.width)
        {
            op(lhs + col, rhs + col, out + col);
            ++col;
        }
    }
}
} }
#endif // CAROTENE_NEON
#endif

434
3rdparty/carotene/src/warp_affine.cpp vendored Normal file
View File

@ -0,0 +1,434 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
// Reports whether warpAffineNearestNeighbor can process a source image of size `ssize`.
//
// On targets where size_t is wider than 32 bits, both dimensions must fit in 32 bits because
// the internal index evaluation is 32-bit. On other targets only isSupportedConfiguration()
// is consulted.
// NOTE(review): the limit tested is the unsigned 32-bit maximum, while warpAffineNearestNeighbor
// stores its indices in an s32 table - confirm the intended bound.
bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
        return false;
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
// Reports whether warpAffineLinear can process a source image of size `ssize`.
//
// On targets where size_t is wider than 32 bits, both dimensions must fit in 32 bits because
// the internal index evaluation is 32-bit. On other targets only isSupportedConfiguration()
// is consulted.
// NOTE(review): the limit tested is the unsigned 32-bit maximum, while warpAffineLinear
// stores its indices in an s32 table - confirm the intended bound.
bool isWarpAffineLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    const bool widthFits = !(ssize.width > 0xffffFFFF);
    const bool heightFits = !(ssize.height > 0xffffFFFF);
    return widthFits && heightFits && isSupportedConfiguration();
#else
    (void)ssize;
    return isSupportedConfiguration();
#endif
}
// Nearest-neighbour affine warp of a u8 image.
//
// For every destination pixel (x, y) the source position is evaluated as
//   src_x = m[0] * x + m[2] * y + m[4]
//   src_y = m[1] * x + m[3] * y + m[5]
// and converted to the offset src_y * srcStride + src_x. The offsets are gathered into a
// per-tile table which is passed to remapNearestNeighborReplicate / remapNearestNeighborConst
// together with the destination tile.
//
// ssize       - source image size; checked through isWarpAffineNearestNeighborSupported()
// dsize       - destination image size; walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// srcBase     - source pixels
// srcStride   - multiplier applied to the source row index when forming an offset
// m           - the six coefficients used as shown above
// dstBase     - destination pixels; rows are located with getRowPtr(dstBase, dstStride, row)
// dstStride   - destination row stride as consumed by getRowPtr
// borderMode  - only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT are handled; for any other
//               value neither branch below executes
// borderValue - forwarded to remapNearestNeighborConst (BORDER_MODE_CONSTANT only)
//
// Without CAROTENE_NEON the body only marks its arguments as unused.
void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Offset table for one tile; the 16 spare entries give alignPtr room to move the start
// (presumably up to a 16-byte boundary - confirm against alignPtr).
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
// Largest valid source coordinates (width - 1, height - 1), broadcast to all lanes.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
// The vector loops advance four destination columns per iteration.
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Table rows are packed: one s32 per pixel, blockWidth entries per row.
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
// Destination x coordinates of the first four columns of this tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// Convert to s32 and clamp into [0, width - 1] / [0, height - 1].
// NOTE(review): vcvtq_s32_f32 rounds toward zero per the Arm intrinsics reference while the
// scalar tail uses floorf; the clamp to 0 appears to make both agree for negative inputs.
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
// offset = src_x + src_y * srcStride
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) columns; x_ is the absolute destination column.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// -1 is written to the table for positions that fall outside the source image
// (presumably remapNearestNeighborConst substitutes borderValue for them - confirm).
int32x4_t v_m1_4 = vdupq_n_s32(-1);
// Float zero here, unlike the s32 v_zero4 of the replicate branch.
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
// A lane is in range when the float coordinate is >= 0 and the converted coordinate is
// <= width - 1 (resp. height - 1). The lower bound is tested on the float value.
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
// In-range lanes get src_x + src_y * srcStride, the others get -1.
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) columns.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// No NEON: nothing is computed; the casts only silence unused-parameter warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
// Affine warp of a u8 image using four neighbouring source pixels per destination pixel.
//
// For every destination pixel (x, y) the source position is evaluated as
//   src_x = m[0] * x + m[2] * y + m[4]
//   src_y = m[1] * x + m[3] * y + m[5]
// With (x0, y0) the floor of that position and x1 = x0 + 1, y1 = y0 + 1, the per-tile tables hold
//   map[4 * p + 0..3] = offsets of (x0, y0), (x1, y0), (x0, y1), (x1, y1), each y * srcStride + x
//   coeffs[2 * p + 0..1] = src_x - x0, src_y - y0
// for pixel p of the tile. The tables are handed to remapLinearReplicate / remapLinearConst
// together with the destination tile.
//
// ssize       - source image size; checked through isWarpAffineLinearSupported()
// dsize       - destination image size; walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// srcBase     - source pixels
// srcStride   - multiplier applied to the source row index when forming an offset
// m           - the six coefficients used as shown above
// dstBase     - destination pixels; rows are located with getRowPtr(dstBase, dstStride, row)
// dstStride   - destination row stride as consumed by getRowPtr
// borderMode  - only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT are handled; for any other
//               value neither branch below executes
// borderValue - forwarded to remapLinearConst (BORDER_MODE_CONSTANT only)
//
// Without CAROTENE_NEON the body only marks its arguments as unused.
void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// Four offsets and two coefficients per tile pixel; the 16 spare entries give alignPtr room
// to move the start (presumably up to a 16-byte boundary - confirm against alignPtr).
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
// Largest valid source coordinates (width - 1, height - 1), broadcast to all lanes.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Packed table rows: 4 offsets resp. 2 coefficients per pixel, blockWidth pixels per row.
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
// Destination x coordinates of the first four columns of this tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
// Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
// Fractional parts relative to the converted integers.
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
// Where the fraction came out negative, add 1 to it and subtract 1 from the integer part.
// This appears to turn the conversion result into a floor, matching floorf in the scalar tail.
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
// Clamp both neighbours (coordinate and coordinate + 1) into [0, width - 1] / [0, height - 1].
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
// Offsets in the order (x0, y0), (x1, y0), (x0, y1), (x1, y1).
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
// Multi-vector stores: 2 coefficients and 4 offsets per pixel, for 4 pixels.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) columns; x_ is the absolute destination column.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
// Coefficients are taken before clamping, from the unclamped floor.
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// The +1 neighbours are derived from the unclamped values, then everything is clamped.
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// Holds the same 0.0f value as v_zero4f declared above.
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// -1 is written to the table for neighbours that fall outside the source image
// (presumably remapLinearConst substitutes borderValue for them - confirm).
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
// Same fraction / integer-part correction as in the replicate branch.
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
// Unclamped offsets in the order (x0, y0), (x1, y0), (x0, y1), (x1, y1); out-of-range ones
// are replaced by -1 below.
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
// Per-axis validity: lower bound tested on the float coordinate (shifted by 1 for the +1
// neighbour), upper bound tested on the integer coordinate against width - 1 / height - 1.
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the last (blockWidth % 4) columns.
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// Each of the four neighbours is stored as an offset when inside the image, else as -1.
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// No NEON: nothing is computed; the casts only silence unused-parameter warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,464 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
// Reports whether warpPerspectiveNearestNeighbor can process a source image of size `ssize`.
//
// On targets where size_t is wider than 32 bits, both dimensions must fit in 32 bits because
// the internal index evaluation is 32-bit. On other targets only isSupportedConfiguration()
// is consulted.
// NOTE(review): the limit tested is the unsigned 32-bit maximum, while
// warpPerspectiveNearestNeighbor stores its indices in an s32 table - confirm the intended bound.
bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
        return false;
#else
    (void)ssize;
#endif
    return isSupportedConfiguration();
}
// Reports whether warpPerspectiveLinear can process a source image of size `ssize`.
//
// On targets where size_t is wider than 32 bits, both dimensions must fit in 32 bits because
// the internal index evaluation is 32-bit. On other targets only isSupportedConfiguration()
// is consulted.
bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
    const bool widthFits = !(ssize.width > 0xffffFFFF);
    const bool heightFits = !(ssize.height > 0xffffFFFF);
    return widthFits && heightFits && isSupportedConfiguration();
#else
    (void)ssize;
    return isSupportedConfiguration();
#endif
}
// Warps u8 image data with a perspective transform using nearest-neighbor sampling.
//
// For every destination pixel (x, y) the source position is evaluated as
//   w     = m[2] * x + m[5] * y + m[8]
//   src_x = (m[0] * x + m[3] * y + m[6]) / w
//   src_y = (m[1] * x + m[4] * y + m[7]) / w
// and rounded down to integer coordinates.
//
// The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels.
// For each tile a table of source offsets (src_y * srcStride + src_x) is built in
// `map` and then handed to a remapNearestNeighbor* helper that writes the tile.
//
// Parameters:
//   ssize      - source image size; used to clamp / range-check source coordinates
//   dsize      - destination image size; defines the iteration range
//   srcBase    - source data, forwarded to the remap helper
//   srcStride  - source row pitch used when forming the offsets stored in `map`
//   m          - transform coefficients; elements m[0]..m[8] are read
//   dstBase    - destination data
//   dstStride  - destination row pitch
//   borderMode - BORDER_MODE_REPLICATE clamps source coordinates to the image;
//                BORDER_MODE_CONSTANT stores -1 for out-of-image positions.
//                Any other value matches neither branch, so nothing is written.
//   borderValue- forwarded to remapNearestNeighborConst (BORDER_MODE_CONSTANT only)
//
// Without CAROTENE_NEON the body is compiled out and the arguments are only
// referenced to silence unused-parameter warnings.
void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
// NOTE(review): assertSupportedConfiguration is defined elsewhere; presumably it
// rejects the call when the size/configuration check fails - confirm.
internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// One s32 offset per tile pixel; the 16 spare elements leave room for alignPtr.
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
// Largest valid source coordinates, broadcast for vector clamping / comparison.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
// Step applied to the x lane vector after each group of 4 pixels.
float32x4_t v_4 = vdupq_n_f32(4.0f);
// Transform coefficients broadcast to all lanes.
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Table rows are packed with a pitch of blockWidth entries (not BLOCK_SIZE).
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
// x is the column inside the tile, y_ the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x coordinates of the first four columns of the tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant parts of the numerators and of the denominator.
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
// Vector path: four destination pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// NOTE(review): vrecpq_f32 is a helper defined elsewhere; presumably the
// reciprocal of the denominator (the scalar tail uses 1.0f / w) - confirm.
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
// vcvtq_s32_f32 truncates toward zero; after clamping to [0, size - 1] this
// matches the floorf-based scalar tail, since negative values clamp to 0.
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
// offset = src_y * srcStride + src_x
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the remaining (blockWidth % 4) columns.
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
// Replicate border: clamp to the nearest valid source pixel.
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// -1 marks table entries whose source position lies outside the image.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Table rows are packed with a pitch of blockWidth entries (not BLOCK_SIZE).
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
// x is the column inside the tile, y_ the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x coordinates of the first four columns of the tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant parts of the numerators and of the denominator.
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
// Vector path: four destination pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// NOTE(review): vrecpq_f32 is a helper defined elsewhere; presumably the
// reciprocal of the denominator (the scalar tail uses 1.0f / w) - confirm.
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
// The lower bound is tested on the float value because truncation toward zero
// would turn coordinates in (-1, 0) into 0; the upper bound uses the integer.
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
// Keep the computed offset for in-image lanes, store -1 for the others.
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the remaining (blockWidth % 4) columns.
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
// In-image positions get their offset, everything else is marked with -1.
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// NEON is unavailable: reference the parameters to avoid unused warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
// Warps u8 image data with a perspective transform, preparing data for linear
// (four-neighbor) interpolation.
//
// For every destination pixel (x, y) the source position is evaluated as
//   w     = m[2] * x + m[5] * y + m[8]
//   src_x = (m[0] * x + m[3] * y + m[6]) / w
//   src_y = (m[1] * x + m[4] * y + m[7]) / w
//
// The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels.
// For each tile pixel two tables are filled and passed to a remapLinear* helper:
//   map    - 4 offsets per pixel, in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1),
//            where (x0,y0) = floor(src) and x1 = x0 + 1, y1 = y0 + 1
//   coeffs - 2 values per pixel: src_x - x0 and src_y - y0 (the fractional parts)
//
// Parameters:
//   ssize      - source image size; used to clamp / range-check source coordinates
//   dsize      - destination image size; defines the iteration range
//   srcBase    - source data, forwarded to the remap helper
//   srcStride  - source row pitch used when forming the offsets stored in `map`
//   m          - transform coefficients; elements m[0]..m[8] are read
//   dstBase    - destination data
//   dstStride  - destination row pitch
//   borderMode - BORDER_MODE_REPLICATE clamps all four neighbors to the image;
//                BORDER_MODE_CONSTANT stores -1 for each out-of-image neighbor.
//                Any other value matches neither branch, so nothing is written.
//   borderValue- forwarded to remapLinearConst (BORDER_MODE_CONSTANT only)
//
// Without CAROTENE_NEON the body is compiled out and the arguments are only
// referenced to silence unused-parameter warnings.
void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
// NOTE(review): assertSupportedConfiguration is defined elsewhere; presumably it
// rejects the call when the size/configuration check fails - confirm.
internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
// 4 offsets and 2 coefficients per tile pixel; the 16 spare elements leave room for alignPtr.
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
// Largest valid source coordinates, broadcast for vector clamping / comparison.
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
// Step applied to the x lane vector after each group of 4 pixels.
float32x4_t v_4 = vdupq_n_f32(4.0f);
// Transform coefficients broadcast to all lanes.
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Both tables are packed per tile: row pitch is blockWidth pixels
// (4 s32 offsets, respectively 2 f32 coefficients, per pixel).
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
// x is the column inside the tile, y_ the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x coordinates of the first four columns of the tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant parts of the numerators and of the denominator.
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
// Vector path: four destination pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// NOTE(review): vrecpq_f32 is a helper defined elsewhere; presumably the
// reciprocal of the denominator (the scalar tail uses 1.0f / w) - confirm.
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
// Truncate toward zero first, then correct to floor below.
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
// A negative remainder means truncation rounded up (negative, non-integer input):
// add 1 to the remainder and subtract 1 from the integer part to emulate floor.
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
// Replicate border: clamp both the base and the +1 neighbor coordinates.
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
// Offsets of the four neighbors: (x0,y0), (x1,y0), (x0,y1), (x1,y1).
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
// Interleaving stores: each pixel's 2 coefficients / 4 offsets end up adjacent,
// matching the (x << 1) + k and (x << 2) + k layout written by the scalar tail.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the remaining (blockWidth % 4) columns.
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
// Coefficients use the unclamped floor values.
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// The +1 neighbor is derived before src0_* itself is clamped.
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
// Produce the destination tile from the offset and coefficient tables.
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
// v_zero4 duplicates the function-level v_zero4f (both are float zero vectors).
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
// -1 marks table entries whose source position lies outside the image.
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
// Both tables are packed per tile: row pitch is blockWidth pixels
// (4 s32 offsets, respectively 2 f32 coefficients, per pixel).
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
// x is the column inside the tile, y_ the absolute destination row.
size_t x = 0, y_ = y + i;
// Absolute destination x coordinates of the first four columns of the tile.
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
// Row-constant parts of the numerators and of the denominator.
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
// Vector path: four destination pixels per iteration.
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
// NOTE(review): vrecpq_f32 is a helper defined elsewhere; presumably the
// reciprocal of the denominator (the scalar tail uses 1.0f / w) - confirm.
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
// Truncate toward zero first, then correct to floor below.
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
// A negative remainder means truncation rounded up (negative, non-integer input):
// add 1 to the remainder and subtract 1 from the integer part to emulate floor.
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
// Unclamped offsets of the four neighbors: (x0,y0), (x1,y0), (x0,y1), (x1,y1).
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
// Per-axis validity masks. Lower bounds are tested on the float coordinate
// (src >= 0 for the base sample, src + 1 >= 0 for the +1 neighbor), upper
// bounds on the integer coordinate against size - 1.
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
// A neighbor keeps its offset only if both of its coordinates are valid; otherwise -1.
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
// Interleaving stores: each pixel's 2 coefficients / 4 offsets end up adjacent,
// matching the (x << 1) + k and (x << 2) + k layout written by the scalar tail.
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
// Scalar tail for the remaining (blockWidth % 4) columns.
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
// Each of the four neighbors is range-checked independently; -1 marks a miss.
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
// Produce the destination tile from the offset and coefficient tables.
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
// NEON is unavailable: reference the parameters to avoid unused warnings.
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

3
3rdparty/ffmpeg/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
downloads/
*.dll
ffmpeg_version.cmake

25
3rdparty/ffmpeg/ffmpeg.cmake vendored Normal file
View File

@ -0,0 +1,25 @@
# Downloads the FFmpeg wrapper DLLs and ffmpeg_version.cmake next to this file
# via ocv_download(), then includes the downloaded ffmpeg_version.cmake.
#
# Binary branch name: ffmpeg/master_20160908
# Binaries were created for OpenCV: 11a65475d8d460a01c8818c5a2d0544ec49d7d68

# Commit of the opencv/opencv_3rdparty repository referenced by the default URL below.
set(FFMPEG_BINARIES_COMMIT "03835134465888981e066434dc95009e8328d4ea")
# Expected hashes of the three files, passed to ocv_download() through HASH.
# NOTE(review): 32 hex digits, so presumably MD5 - confirm against ocv_download().
set(FFMPEG_FILE_HASH_BIN32 "32ba7790b0ac7a6dc66be91603637a7d")
set(FFMPEG_FILE_HASH_BIN64 "068ecaa459a5571e7909cff90999a420")
set(FFMPEG_FILE_HASH_CMAKE "f99941d10c1e87bf16b9055e8fc91ab2")
# List of candidate base URLs: the OPENCV_FFMPEG_URL CMake variable, the
# OPENCV_FFMPEG_URL environment variable, then the GitHub raw URL pinned to
# FFMPEG_BINARIES_COMMIT. The argument is intentionally unquoted so that empty
# entries are dropped from the list.
# NOTE(review): presumably ocv_download() tries them in order - verify.
set(FFMPEG_DOWNLOAD_URL ${OPENCV_FFMPEG_URL};$ENV{OPENCV_FFMPEG_URL};https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FFMPEG_BINARIES_COMMIT}/ffmpeg/)
# First wrapper DLL (checked against FFMPEG_FILE_HASH_BIN32).
ocv_download(PACKAGE opencv_ffmpeg.dll
HASH ${FFMPEG_FILE_HASH_BIN32}
URL ${FFMPEG_DOWNLOAD_URL}
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
# Second wrapper DLL (checked against FFMPEG_FILE_HASH_BIN64).
ocv_download(PACKAGE opencv_ffmpeg_64.dll
HASH ${FFMPEG_FILE_HASH_BIN64}
URL ${FFMPEG_DOWNLOAD_URL}
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
# CMake fragment that is included right after being downloaded.
ocv_download(PACKAGE ffmpeg_version.cmake
HASH ${FFMPEG_FILE_HASH_CMAKE}
URL ${FFMPEG_DOWNLOAD_URL}
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
# The included file is fetched by the ocv_download() call above, so it must run first.
include(${CMAKE_CURRENT_LIST_DIR}/ffmpeg_version.cmake)

View File

@ -1,11 +0,0 @@
# Hard-coded FFmpeg availability flags and library version strings.
# NOTE(review): the consumers of these variables are outside this fragment;
# presumably they describe the FFmpeg build shipped with the project - confirm.

# Availability flags, each set to 1.
set(HAVE_FFMPEG 1)
set(HAVE_FFMPEG_CODEC 1)
set(HAVE_FFMPEG_FORMAT 1)
set(HAVE_FFMPEG_UTIL 1)
set(HAVE_FFMPEG_SWSCALE 1)
set(HAVE_GENTOO_FFMPEG 1)

# Version strings for libavcodec, libavformat, libavutil and libswscale.
set(ALIASOF_libavcodec_VERSION 55.18.102)
set(ALIASOF_libavformat_VERSION 55.12.100)
set(ALIASOF_libavutil_VERSION 52.38.100)
set(ALIASOF_libswscale_VERSION 2.3.100)

View File

@ -1 +0,0 @@
#include "cap_ffmpeg_impl.hpp"

520
3rdparty/ffmpeg/license.txt vendored Normal file
View File

@ -0,0 +1,520 @@
Copyright (C) 2001 Fabrice Bellard
FFmpeg is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
FFmpeg is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with FFmpeg; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
==================================================================================
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!

View File

@ -1,2 +0,0 @@
rem Build the 32-bit wrapper DLL opencv_ffmpeg.dll from ffopencv.c.
rem The msys32 bin directory is prepended to PATH for this command only; the path is hard-coded.
rem ffopencv.c is compiled as C++ (-x c++) but through the gcc driver, and linked against the
rem ffmpeg static libs found in ../lib plus ws2_32.
set path=c:\dev\msys32\bin;%path% & gcc -Wall -shared -o opencv_ffmpeg.dll -O2 -x c++ -I../include -I../include/ffmpeg_ -I../../modules/highgui/src ffopencv.c -L../lib -lavformat -lavcodec -lavdevice -lswscale -lavutil -lws2_32
rem Same build for 64 bits: msys64 toolchain, -m64, output opencv_ffmpeg_64.dll, and the
rem ffmpeg libs carrying the "64" suffix (libavformat64 etc.).
set path=c:\dev\msys64\bin;%path% & gcc -m64 -Wall -shared -o opencv_ffmpeg_64.dll -O2 -x c++ -I../include -I../include/ffmpeg_ -I../../modules/highgui/src ffopencv.c -L../lib -lavformat64 -lavcodec64 -lavdevice64 -lswscale64 -lavutil64 -lws2_32

Binary file not shown.

Binary file not shown.

View File

@ -1,42 +1,38 @@
The build script is to be fixed.
Right now it assumes that 32-bit MinGW is in the system path and
64-bit mingw is installed to c:\Apps\MinGW64.
* On Linux and other Unix flavors OpenCV uses default or user-built ffmpeg/libav libraries.
If user builds ffmpeg/libav from source and wants OpenCV to stay BSD library, not GPL/LGPL,
he/she should use --enable-shared configure flag and make sure that no GPL components are
enabled (some notable examples are x264 (H264 encoder) and libac3 (Dolby AC3 audio codec)).
See https://www.ffmpeg.org/legal.html for details.
It is important that gcc is used, not g++!
Otherwise the produced DLL will likely be dependent on libgcc_s_dw2-1.dll or similar DLL.
While we want to make the DLLs with minimum dependencies: Win32 libraries + msvcrt.dll.
If you want to play very safe and do not want to use FFMPEG at all, regardless of whether it's installed on
your system or not, configure and build OpenCV using CMake with WITH_FFMPEG=OFF flag. OpenCV will then use
AVFoundation (OSX), GStreamer (Linux) or other available backends supported by opencv_videoio module.
ffopencv.c is really a C++ source, hence -x c++ is used.
There is also our self-contained motion jpeg codec, which you can use without any worries.
It handles CV_FOURCC('M', 'J', 'P', 'G') streams within an AVI container (".avi").
How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is released?
* On Windows OpenCV uses pre-built ffmpeg binaries, built with proper flags (without GPL components) and
wrapped with simple, stable OpenCV-compatible API.
The binaries are opencv_ffmpeg.dll (version for 32-bit Windows) and
opencv_ffmpeg_64.dll (version for 64-bit Windows).
1. Install 32-bit MinGW + MSYS from
http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
Let's assume, it's installed in C:\MSYS32.
2. Install 64-bit MinGW. http://mingw-w64.sourceforge.net/
Let's assume, it's installed in C:\MSYS64
3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
See build_win32.txt for the build instructions, if you want to rebuild opencv_ffmpeg*.dll from scratch.
4. Now you have working MSYS32 and MSYS64 environments.
Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
The pre-built opencv_ffmpeg*.dll is:
* An LGPL library, not a BSD library.
* Loaded at runtime by opencv_videoio module.
If it succeeds, ffmpeg can be used to decode/encode videos;
otherwise, other API is used.
4. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
Copy it to C:\MSYS{32|64}\msys\home\<loginname> directory.
The FFMPEG build contains an H264 encoder based on the OpenH264 library, which should be installed separately.
OpenH264 Video Codec provided by Cisco Systems, Inc.
See https://github.com/cisco/openh264/releases for details and OpenH264 license.
Downloaded binary file can be placed into global system path (System32 or SysWOW64) or near application binaries.
You can also specify location of binary file via OPENH264_LIBRARY_PATH environment variable.
5. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
If LGPL/GPL software can not be supplied with your OpenCV-based product, simply exclude
opencv_ffmpeg*.dll from your distribution; OpenCV will stay fully functional except for the ability to
decode/encode videos using FFMPEG (though, it may still be able to do that using other API,
such as Video for Windows, Windows Media Foundation or our self-contained motion jpeg codec).
5.1. tar -xzf ffmpeg-x.y.z.tar.gz
5.2. mkdir build
5.3. cd build
5.4. ../ffmpeg-x.y.z/configure --enable-w32threads
5.5. make
5.6. make install
5.7. cd /local/lib
5.8. strip -g *.a
6. Then repeat the same for 64-bit case. The output libs: libavcodec.a etc. need to be renamed to libavcodec64.a etc.
7. Then, copy all those libs to <opencv>\3rdparty\lib\, copy the headers to <opencv>\3rdparty\include\ffmpeg_.
8. Then, go to <opencv>\3rdparty\ffmpeg, edit make.bat
(change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat
See license.txt for the FFMPEG copyright notice and the licensing terms.

File diff suppressed because it is too large Load Diff

View File

@ -1,116 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_AVFFT_H
#define AVCODEC_AVFFT_H
/**
* @file
* @ingroup lavc_fft
* FFT functions
*/
/**
* @defgroup lavc_fft FFT functions
* @ingroup lavc_misc
*
* @{
*/
/** Scalar sample type used by the transforms declared in this header. */
typedef float FFTSample;
/** Complex value made of two FFTSample components, re and im. */
typedef struct FFTComplex {
FFTSample re, im;
} FFTComplex;
/**
* Transform context; only forward-declared here, so callers handle it
* exclusively through pointers returned by av_fft_init() / av_mdct_init().
*/
typedef struct FFTContext FFTContext;
/**
* Set up a complex FFT.
* @param nbits log2 of the length of the input array
* @param inverse if 0 perform the forward transform, if 1 perform the inverse
*/
FFTContext *av_fft_init(int nbits, int inverse);
/**
* Do the permutation needed BEFORE calling av_fft_calc().
* @param s FFT context, as set up by av_fft_init()
* @param z data to permute
* NOTE(review): presumably z is permuted in place and has the length
* configured at init time -- confirm against the implementation.
*/
void av_fft_permute(FFTContext *s, FFTComplex *z);
/**
* Do a complex FFT with the parameters defined in av_fft_init(). The
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
*/
void av_fft_calc(FFTContext *s, FFTComplex *z);
/**
* End use of an FFT context.
* NOTE(review): presumably releases a context obtained from av_fft_init()
* and makes the pointer invalid afterwards -- confirm.
*/
void av_fft_end(FFTContext *s);
/**
* MDCT entry points. They reuse the FFTContext type of the complex FFT;
* the calc functions read from a const input array and write to output.
* NOTE(review): nbits and inverse presumably follow the av_fft_init()
* convention, and the meaning of scale is not stated in this header --
* confirm against the implementation before relying on either.
*/
FFTContext *av_mdct_init(int nbits, int inverse, double scale);
void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
void av_mdct_end(FFTContext *s);
/* Real Discrete Fourier Transform */
/**
* Transform kinds accepted by av_rdft_init() through its trans parameter.
* NOTE(review): the names suggest forward/inverse (DFT/IDFT) combined with
* real-to-complex/complex-to-real (R2C/C2R) -- confirm exact semantics.
*/
enum RDFTransformType {
DFT_R2C,
IDFT_C2R,
IDFT_R2C,
DFT_C2R,
};
/** Context for the real transform; only forward-declared in this header. */
typedef struct RDFTContext RDFTContext;
/**
* Set up a real FFT.
* @param nbits log2 of the length of the input array
* @param trans the type of transform
*/
RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans);
/**
* Run the transform selected in av_rdft_init() on data.
* NOTE(review): the single non-const data pointer suggests the transform
* is done in place -- confirm.
*/
void av_rdft_calc(RDFTContext *s, FFTSample *data);
/** End use of a context from av_rdft_init(). NOTE(review): presumably frees it -- confirm. */
void av_rdft_end(RDFTContext *s);
/* Discrete Cosine Transform */
/** Context for the DCT/DST transforms; only forward-declared in this header. */
typedef struct DCTContext DCTContext;
/** Transform kinds accepted by av_dct_init() through its type parameter. */
enum DCTTransformType {
DCT_II = 0,
DCT_III,
DCT_I,
DST_I,
};
/**
* Set up DCT.
* @param nbits size of the input array:
* (1 << nbits) for DCT-II, DCT-III and DST-I
* (1 << nbits) + 1 for DCT-I
* @param type the type of transform
*
* @note the first element of the input of DST-I is ignored
*/
DCTContext *av_dct_init(int nbits, enum DCTTransformType type);
/**
* Run the transform selected in av_dct_init() on data.
* NOTE(review): the single non-const data pointer suggests the transform
* is done in place -- confirm.
*/
void av_dct_calc(DCTContext *s, FFTSample *data);
/** End use of a context from av_dct_init(). NOTE(review): presumably frees it -- confirm. */
void av_dct_end (DCTContext *s);
/**
* @}
*/
#endif /* AVCODEC_AVFFT_H */

View File

@ -1,95 +0,0 @@
/*
* DXVA2 HW acceleration
*
* copyright (c) 2009 Laurent Aimar
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_DXVA_H
#define AVCODEC_DXVA_H
/**
* @file
* @ingroup lavc_codec_hwaccel_dxva2
* Public libavcodec DXVA2 header.
*/
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0600
#undef _WIN32_WINNT
#endif
#if !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0600
#endif
#include <stdint.h>
#include <d3d9.h>
#include <dxva2api.h>
/**
* @defgroup lavc_codec_hwaccel_dxva2 DXVA2
* @ingroup lavc_codec_hwaccel
*
* @{
*/
#define FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG 1 ///< Work around for DXVA2 and old UVD/UVD+ ATI video cards
/**
* This structure is used to provide the necessary configurations and data
* to the DXVA2 FFmpeg HWAccel implementation.
*
* The application must make it available as AVCodecContext.hwaccel_context.
*/
struct dxva_context {
/**
* DXVA2 decoder object
*/
IDirectXVideoDecoder *decoder;
/**
* DXVA2 configuration used to create the decoder
*/
const DXVA2_ConfigPictureDecode *cfg;
/**
* The number of surfaces in the surface array
*/
unsigned surface_count;
/**
* The array of Direct3D surfaces used to create the decoder
*/
LPDIRECT3DSURFACE9 *surface;
/**
* A bit field configuring the workarounds needed for using the decoder
* NOTE(review): presumably built from the FF_DXVA2_WORKAROUND_* flags
* defined in this header -- confirm.
*/
uint64_t workaround;
/**
* Private to the FFmpeg AVHWAccel implementation
*/
unsigned report_id;
};
/**
* @}
*/
#endif /* AVCODEC_DXVA_H */

View File

@ -1,397 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_OLD_CODEC_IDS_H
#define AVCODEC_OLD_CODEC_IDS_H
#include "libavutil/common.h"
/*
* This header exists to prevent new codec IDs from being accidentally added to
* the deprecated list.
* Do not include it directly. It will be removed on next major bump
*
* Do not add new items to this list. Use the AVCodecID enum instead.
*/
CODEC_ID_NONE = AV_CODEC_ID_NONE,
/* video codecs */
CODEC_ID_MPEG1VIDEO,
CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding
CODEC_ID_MPEG2VIDEO_XVMC,
CODEC_ID_H261,
CODEC_ID_H263,
CODEC_ID_RV10,
CODEC_ID_RV20,
CODEC_ID_MJPEG,
CODEC_ID_MJPEGB,
CODEC_ID_LJPEG,
CODEC_ID_SP5X,
CODEC_ID_JPEGLS,
CODEC_ID_MPEG4,
CODEC_ID_RAWVIDEO,
CODEC_ID_MSMPEG4V1,
CODEC_ID_MSMPEG4V2,
CODEC_ID_MSMPEG4V3,
CODEC_ID_WMV1,
CODEC_ID_WMV2,
CODEC_ID_H263P,
CODEC_ID_H263I,
CODEC_ID_FLV1,
CODEC_ID_SVQ1,
CODEC_ID_SVQ3,
CODEC_ID_DVVIDEO,
CODEC_ID_HUFFYUV,
CODEC_ID_CYUV,
CODEC_ID_H264,
CODEC_ID_INDEO3,
CODEC_ID_VP3,
CODEC_ID_THEORA,
CODEC_ID_ASV1,
CODEC_ID_ASV2,
CODEC_ID_FFV1,
CODEC_ID_4XM,
CODEC_ID_VCR1,
CODEC_ID_CLJR,
CODEC_ID_MDEC,
CODEC_ID_ROQ,
CODEC_ID_INTERPLAY_VIDEO,
CODEC_ID_XAN_WC3,
CODEC_ID_XAN_WC4,
CODEC_ID_RPZA,
CODEC_ID_CINEPAK,
CODEC_ID_WS_VQA,
CODEC_ID_MSRLE,
CODEC_ID_MSVIDEO1,
CODEC_ID_IDCIN,
CODEC_ID_8BPS,
CODEC_ID_SMC,
CODEC_ID_FLIC,
CODEC_ID_TRUEMOTION1,
CODEC_ID_VMDVIDEO,
CODEC_ID_MSZH,
CODEC_ID_ZLIB,
CODEC_ID_QTRLE,
CODEC_ID_TSCC,
CODEC_ID_ULTI,
CODEC_ID_QDRAW,
CODEC_ID_VIXL,
CODEC_ID_QPEG,
CODEC_ID_PNG,
CODEC_ID_PPM,
CODEC_ID_PBM,
CODEC_ID_PGM,
CODEC_ID_PGMYUV,
CODEC_ID_PAM,
CODEC_ID_FFVHUFF,
CODEC_ID_RV30,
CODEC_ID_RV40,
CODEC_ID_VC1,
CODEC_ID_WMV3,
CODEC_ID_LOCO,
CODEC_ID_WNV1,
CODEC_ID_AASC,
CODEC_ID_INDEO2,
CODEC_ID_FRAPS,
CODEC_ID_TRUEMOTION2,
CODEC_ID_BMP,
CODEC_ID_CSCD,
CODEC_ID_MMVIDEO,
CODEC_ID_ZMBV,
CODEC_ID_AVS,
CODEC_ID_SMACKVIDEO,
CODEC_ID_NUV,
CODEC_ID_KMVC,
CODEC_ID_FLASHSV,
CODEC_ID_CAVS,
CODEC_ID_JPEG2000,
CODEC_ID_VMNC,
CODEC_ID_VP5,
CODEC_ID_VP6,
CODEC_ID_VP6F,
CODEC_ID_TARGA,
CODEC_ID_DSICINVIDEO,
CODEC_ID_TIERTEXSEQVIDEO,
CODEC_ID_TIFF,
CODEC_ID_GIF,
CODEC_ID_DXA,
CODEC_ID_DNXHD,
CODEC_ID_THP,
CODEC_ID_SGI,
CODEC_ID_C93,
CODEC_ID_BETHSOFTVID,
CODEC_ID_PTX,
CODEC_ID_TXD,
CODEC_ID_VP6A,
CODEC_ID_AMV,
CODEC_ID_VB,
CODEC_ID_PCX,
CODEC_ID_SUNRAST,
CODEC_ID_INDEO4,
CODEC_ID_INDEO5,
CODEC_ID_MIMIC,
CODEC_ID_RL2,
CODEC_ID_ESCAPE124,
CODEC_ID_DIRAC,
CODEC_ID_BFI,
CODEC_ID_CMV,
CODEC_ID_MOTIONPIXELS,
CODEC_ID_TGV,
CODEC_ID_TGQ,
CODEC_ID_TQI,
CODEC_ID_AURA,
CODEC_ID_AURA2,
CODEC_ID_V210X,
CODEC_ID_TMV,
CODEC_ID_V210,
CODEC_ID_DPX,
CODEC_ID_MAD,
CODEC_ID_FRWU,
CODEC_ID_FLASHSV2,
CODEC_ID_CDGRAPHICS,
CODEC_ID_R210,
CODEC_ID_ANM,
CODEC_ID_BINKVIDEO,
CODEC_ID_IFF_ILBM,
CODEC_ID_IFF_BYTERUN1,
CODEC_ID_KGV1,
CODEC_ID_YOP,
CODEC_ID_VP8,
CODEC_ID_PICTOR,
CODEC_ID_ANSI,
CODEC_ID_A64_MULTI,
CODEC_ID_A64_MULTI5,
CODEC_ID_R10K,
CODEC_ID_MXPEG,
CODEC_ID_LAGARITH,
CODEC_ID_PRORES,
CODEC_ID_JV,
CODEC_ID_DFA,
CODEC_ID_WMV3IMAGE,
CODEC_ID_VC1IMAGE,
CODEC_ID_UTVIDEO,
CODEC_ID_BMV_VIDEO,
CODEC_ID_VBLE,
CODEC_ID_DXTORY,
CODEC_ID_V410,
CODEC_ID_XWD,
CODEC_ID_CDXL,
CODEC_ID_XBM,
CODEC_ID_ZEROCODEC,
CODEC_ID_MSS1,
CODEC_ID_MSA1,
CODEC_ID_TSCC2,
CODEC_ID_MTS2,
CODEC_ID_CLLC,
CODEC_ID_Y41P = MKBETAG('Y','4','1','P'),
CODEC_ID_ESCAPE130 = MKBETAG('E','1','3','0'),
CODEC_ID_EXR = MKBETAG('0','E','X','R'),
CODEC_ID_AVRP = MKBETAG('A','V','R','P'),
CODEC_ID_G2M = MKBETAG( 0 ,'G','2','M'),
CODEC_ID_AVUI = MKBETAG('A','V','U','I'),
CODEC_ID_AYUV = MKBETAG('A','Y','U','V'),
CODEC_ID_V308 = MKBETAG('V','3','0','8'),
CODEC_ID_V408 = MKBETAG('V','4','0','8'),
CODEC_ID_YUV4 = MKBETAG('Y','U','V','4'),
CODEC_ID_SANM = MKBETAG('S','A','N','M'),
CODEC_ID_PAF_VIDEO = MKBETAG('P','A','F','V'),
CODEC_ID_SNOW = AV_CODEC_ID_SNOW,
/* various PCM "codecs" */
CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
CODEC_ID_PCM_S16LE = 0x10000,
CODEC_ID_PCM_S16BE,
CODEC_ID_PCM_U16LE,
CODEC_ID_PCM_U16BE,
CODEC_ID_PCM_S8,
CODEC_ID_PCM_U8,
CODEC_ID_PCM_MULAW,
CODEC_ID_PCM_ALAW,
CODEC_ID_PCM_S32LE,
CODEC_ID_PCM_S32BE,
CODEC_ID_PCM_U32LE,
CODEC_ID_PCM_U32BE,
CODEC_ID_PCM_S24LE,
CODEC_ID_PCM_S24BE,
CODEC_ID_PCM_U24LE,
CODEC_ID_PCM_U24BE,
CODEC_ID_PCM_S24DAUD,
CODEC_ID_PCM_ZORK,
CODEC_ID_PCM_S16LE_PLANAR,
CODEC_ID_PCM_DVD,
CODEC_ID_PCM_F32BE,
CODEC_ID_PCM_F32LE,
CODEC_ID_PCM_F64BE,
CODEC_ID_PCM_F64LE,
CODEC_ID_PCM_BLURAY,
CODEC_ID_PCM_LXF,
CODEC_ID_S302M,
CODEC_ID_PCM_S8_PLANAR,
/* various ADPCM codecs */
CODEC_ID_ADPCM_IMA_QT = 0x11000,
CODEC_ID_ADPCM_IMA_WAV,
CODEC_ID_ADPCM_IMA_DK3,
CODEC_ID_ADPCM_IMA_DK4,
CODEC_ID_ADPCM_IMA_WS,
CODEC_ID_ADPCM_IMA_SMJPEG,
CODEC_ID_ADPCM_MS,
CODEC_ID_ADPCM_4XM,
CODEC_ID_ADPCM_XA,
CODEC_ID_ADPCM_ADX,
CODEC_ID_ADPCM_EA,
CODEC_ID_ADPCM_G726,
CODEC_ID_ADPCM_CT,
CODEC_ID_ADPCM_SWF,
CODEC_ID_ADPCM_YAMAHA,
CODEC_ID_ADPCM_SBPRO_4,
CODEC_ID_ADPCM_SBPRO_3,
CODEC_ID_ADPCM_SBPRO_2,
CODEC_ID_ADPCM_THP,
CODEC_ID_ADPCM_IMA_AMV,
CODEC_ID_ADPCM_EA_R1,
CODEC_ID_ADPCM_EA_R3,
CODEC_ID_ADPCM_EA_R2,
CODEC_ID_ADPCM_IMA_EA_SEAD,
CODEC_ID_ADPCM_IMA_EA_EACS,
CODEC_ID_ADPCM_EA_XAS,
CODEC_ID_ADPCM_EA_MAXIS_XA,
CODEC_ID_ADPCM_IMA_ISS,
CODEC_ID_ADPCM_G722,
CODEC_ID_ADPCM_IMA_APC,
CODEC_ID_VIMA = MKBETAG('V','I','M','A'),
/* AMR */
CODEC_ID_AMR_NB = 0x12000,
CODEC_ID_AMR_WB,
/* RealAudio codecs*/
CODEC_ID_RA_144 = 0x13000,
CODEC_ID_RA_288,
/* various DPCM codecs */
CODEC_ID_ROQ_DPCM = 0x14000,
CODEC_ID_INTERPLAY_DPCM,
CODEC_ID_XAN_DPCM,
CODEC_ID_SOL_DPCM,
/* audio codecs */
CODEC_ID_MP2 = 0x15000,
CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3
CODEC_ID_AAC,
CODEC_ID_AC3,
CODEC_ID_DTS,
CODEC_ID_VORBIS,
CODEC_ID_DVAUDIO,
CODEC_ID_WMAV1,
CODEC_ID_WMAV2,
CODEC_ID_MACE3,
CODEC_ID_MACE6,
CODEC_ID_VMDAUDIO,
CODEC_ID_FLAC,
CODEC_ID_MP3ADU,
CODEC_ID_MP3ON4,
CODEC_ID_SHORTEN,
CODEC_ID_ALAC,
CODEC_ID_WESTWOOD_SND1,
CODEC_ID_GSM, ///< as in Berlin toast format
CODEC_ID_QDM2,
CODEC_ID_COOK,
CODEC_ID_TRUESPEECH,
CODEC_ID_TTA,
CODEC_ID_SMACKAUDIO,
CODEC_ID_QCELP,
CODEC_ID_WAVPACK,
CODEC_ID_DSICINAUDIO,
CODEC_ID_IMC,
CODEC_ID_MUSEPACK7,
CODEC_ID_MLP,
CODEC_ID_GSM_MS, /* as found in WAV */
CODEC_ID_ATRAC3,
CODEC_ID_VOXWARE,
CODEC_ID_APE,
CODEC_ID_NELLYMOSER,
CODEC_ID_MUSEPACK8,
CODEC_ID_SPEEX,
CODEC_ID_WMAVOICE,
CODEC_ID_WMAPRO,
CODEC_ID_WMALOSSLESS,
CODEC_ID_ATRAC3P,
CODEC_ID_EAC3,
CODEC_ID_SIPR,
CODEC_ID_MP1,
CODEC_ID_TWINVQ,
CODEC_ID_TRUEHD,
CODEC_ID_MP4ALS,
CODEC_ID_ATRAC1,
CODEC_ID_BINKAUDIO_RDFT,
CODEC_ID_BINKAUDIO_DCT,
CODEC_ID_AAC_LATM,
CODEC_ID_QDMC,
CODEC_ID_CELT,
CODEC_ID_G723_1,
CODEC_ID_G729,
CODEC_ID_8SVX_EXP,
CODEC_ID_8SVX_FIB,
CODEC_ID_BMV_AUDIO,
CODEC_ID_RALF,
CODEC_ID_IAC,
CODEC_ID_ILBC,
CODEC_ID_FFWAVESYNTH = MKBETAG('F','F','W','S'),
CODEC_ID_SONIC = MKBETAG('S','O','N','C'),
CODEC_ID_SONIC_LS = MKBETAG('S','O','N','L'),
CODEC_ID_PAF_AUDIO = MKBETAG('P','A','F','A'),
CODEC_ID_OPUS = MKBETAG('O','P','U','S'),
/* subtitle codecs */
CODEC_ID_FIRST_SUBTITLE = 0x17000, ///< A dummy ID pointing at the start of subtitle codecs.
CODEC_ID_DVD_SUBTITLE = 0x17000,
CODEC_ID_DVB_SUBTITLE,
CODEC_ID_TEXT, ///< raw UTF-8 text
CODEC_ID_XSUB,
CODEC_ID_SSA,
CODEC_ID_MOV_TEXT,
CODEC_ID_HDMV_PGS_SUBTITLE,
CODEC_ID_DVB_TELETEXT,
CODEC_ID_SRT,
CODEC_ID_MICRODVD = MKBETAG('m','D','V','D'),
CODEC_ID_EIA_608 = MKBETAG('c','6','0','8'),
CODEC_ID_JACOSUB = MKBETAG('J','S','U','B'),
CODEC_ID_SAMI = MKBETAG('S','A','M','I'),
CODEC_ID_REALTEXT = MKBETAG('R','T','X','T'),
CODEC_ID_SUBVIEWER = MKBETAG('S','u','b','V'),
/* other specific kind of codecs (generally used for attachments) */
CODEC_ID_FIRST_UNKNOWN = 0x18000, ///< A dummy ID pointing at the start of various fake codecs.
CODEC_ID_TTF = 0x18000,
CODEC_ID_BINTEXT = MKBETAG('B','T','X','T'),
CODEC_ID_XBIN = MKBETAG('X','B','I','N'),
CODEC_ID_IDF = MKBETAG( 0 ,'I','D','F'),
CODEC_ID_OTF = MKBETAG( 0 ,'O','T','F'),
CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like CODEC_ID_NONE) but lavf should attempt to identify it
CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
* stream (only used by libavformat) */
CODEC_ID_MPEG4SYSTEMS = 0x20001, /**< _FAKE_ codec to indicate a MPEG-4 Systems
* stream (only used by libavformat) */
CODEC_ID_FFMETADATA = 0x21000, ///< Dummy codec for streams containing only metadata information.
#endif /* AVCODEC_OLD_CODEC_IDS_H */

View File

@ -1,173 +0,0 @@
/*
* Video Acceleration API (shared data between FFmpeg and the video player)
* HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
*
* Copyright (C) 2008-2009 Splitted-Desktop Systems
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_VAAPI_H
#define AVCODEC_VAAPI_H
/**
* @file
* @ingroup lavc_codec_hwaccel_vaapi
* Public libavcodec VA API header.
*/
#include <stdint.h>
/**
* @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
* @ingroup lavc_codec_hwaccel
* @{
*/
/**
* This structure is used to share data between the FFmpeg library and
* the client video application.
* This shall be zero-allocated and available as
* AVCodecContext.hwaccel_context. All user members can be set once
* during initialization or through each AVCodecContext.get_buffer()
* function call. In any case, they must be valid prior to calling
* decoding functions.
*
* The user members are display, config_id and context_id; every other
* member is marked "Set by libavcodec" below.
*/
struct vaapi_context {
/**
* Window system dependent data
*
* - encoding: unused
* - decoding: Set by user
*/
void *display;
/**
* Configuration ID
*
* - encoding: unused
* - decoding: Set by user
*/
uint32_t config_id;
/**
* Context ID (video decode pipeline)
*
* - encoding: unused
* - decoding: Set by user
*/
uint32_t context_id;
/**
* VAPictureParameterBuffer ID
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
uint32_t pic_param_buf_id;
/**
* VAIQMatrixBuffer ID
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
uint32_t iq_matrix_buf_id;
/**
* VABitPlaneBuffer ID (for VC-1 decoding)
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
uint32_t bitplane_buf_id;
/**
* Slice parameter/data buffer IDs
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
uint32_t *slice_buf_ids;
/**
* Number of effective slice buffer IDs to send to the HW
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
unsigned int n_slice_buf_ids;
/**
* Size of pre-allocated slice_buf_ids
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
unsigned int slice_buf_ids_alloc;
/**
* Pointer to VASliceParameterBuffers
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
void *slice_params;
/**
* Size of a VASliceParameterBuffer element
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
unsigned int slice_param_size;
/**
* Size of pre-allocated slice_params
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
unsigned int slice_params_alloc;
/**
* Number of slices currently filled in
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
unsigned int slice_count;
/**
* Pointer to slice data buffer base
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
const uint8_t *slice_data;
/**
* Current size of slice data
*
* - encoding: unused
* - decoding: Set by libavcodec
*/
uint32_t slice_data_size;
};
/* @} */
#endif /* AVCODEC_VAAPI_H */

View File

@ -1,162 +0,0 @@
/*
* VDA HW acceleration
*
* copyright (c) 2011 Sebastien Zwickert
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_VDA_H
#define AVCODEC_VDA_H
/**
* @file
* @ingroup lavc_codec_hwaccel_vda
* Public libavcodec VDA header.
*/
#include <stdint.h>
// emmintrin.h is unable to compile with -std=c99 -Werror=missing-prototypes
// http://openradar.appspot.com/8026390
#undef __GNUC_STDC_INLINE__
#define Picture QuickdrawPicture
#include <VideoDecodeAcceleration/VDADecoder.h>
#undef Picture
#include "libavcodec/version.h"
/**
* @defgroup lavc_codec_hwaccel_vda VDA
* @ingroup lavc_codec_hwaccel
*
* @{
*/
/**
* This structure is used to provide the necessary configurations and data
* to the VDA FFmpeg HWAccel implementation.
*
* The application must make it available as AVCodecContext.hwaccel_context.
*/
struct vda_context {
/**
* VDA decoder object.
*
* - encoding: unused
* - decoding: Set/Unset by libavcodec.
*/
VDADecoder decoder;
/**
* The Core Video pixel buffer that contains the current image data.
*
* - encoding: unused
* - decoding: Set by libavcodec. Unset by user.
*/
CVPixelBufferRef cv_buffer;
/**
* Use the hardware decoder in synchronous mode.
*
* - encoding: unused
* - decoding: Set by user.
*/
int use_sync_decoding;
/**
* The frame width.
*
* - encoding: unused
* - decoding: Set/Unset by user.
*/
int width;
/**
* The frame height.
*
* - encoding: unused
* - decoding: Set/Unset by user.
*/
int height;
/**
* The frame format.
*
* - encoding: unused
* - decoding: Set/Unset by user.
*/
int format;
/**
* The pixel format for output image buffers.
*
* - encoding: unused
* - decoding: Set/Unset by user.
*/
OSType cv_pix_fmt_type;
/**
* The current bitstream buffer.
*
* - encoding: unused
* - decoding: Set/Unset by libavcodec.
*/
uint8_t *priv_bitstream;
/**
* The current size of the bitstream.
*
* - encoding: unused
* - decoding: Set/Unset by libavcodec.
*/
int priv_bitstream_size;
/**
* The reference size used for fast reallocation.
*
* - encoding: unused
* - decoding: Set/Unset by libavcodec.
*/
int priv_allocated_size;
/**
* Use av_buffer to manage buffer.
* When the flag is set, the CVPixelBuffers returned by the decoder will
* be released automatically, so you have to retain them if necessary.
* Not setting this flag may cause a memory leak.
*
* - encoding: unused
* - decoding: Set by user.
*/
int use_ref_buffer;
};
/**
* Create the video decoder.
*
* @param vda_ctx        VDA context (struct vda_context) the decoder is created for
* @param extradata      codec extradata buffer
*                       (NOTE(review): expected format is not stated in this header - confirm)
* @param extradata_size size of extradata (presumably in bytes - confirm)
* @return status code
*         (NOTE(review): success/failure convention is not visible in this header - confirm)
*/
int ff_vda_create_decoder(struct vda_context *vda_ctx,
uint8_t *extradata,
int extradata_size);
/**
* Destroy the video decoder.
*
* @param vda_ctx VDA context (struct vda_context) whose decoder is destroyed
* @return status code
*         (NOTE(review): success/failure convention is not visible in this header - confirm)
*/
int ff_vda_destroy_decoder(struct vda_context *vda_ctx);
/**
* @}
*/
#endif /* AVCODEC_VDA_H */

View File

@ -1,159 +0,0 @@
/*
* The Video Decode and Presentation API for UNIX (VDPAU) is used for
* hardware-accelerated decoding of MPEG-1/2, H.264 and VC-1.
*
* Copyright (C) 2008 NVIDIA
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_VDPAU_H
#define AVCODEC_VDPAU_H
/**
* @file
* @ingroup lavc_codec_hwaccel_vdpau
* Public libavcodec VDPAU header.
*/
/**
* @defgroup lavc_codec_hwaccel_vdpau VDPAU Decoder and Renderer
* @ingroup lavc_codec_hwaccel
*
* VDPAU hardware acceleration has two modules
* - VDPAU decoding
* - VDPAU presentation
*
* The VDPAU decoding module parses all headers using FFmpeg
* parsing mechanisms and uses VDPAU for the actual decoding.
*
* As per the current implementation, the actual decoding
* and rendering (API calls) are done as part of the VDPAU
* presentation (vo_vdpau.c) module.
*
* @{
*/
#include <vdpau/vdpau.h>
#include <vdpau/vdpau_x11.h>
#include "libavutil/avconfig.h"
/**
* VDPAU picture information, stored as a union holding one VdpPictureInfo*
* variant per codec family listed below. It is embedded as the "info" member
* of AVVDPAUContext and struct vdpau_render_state.
*/
union FFVdpPictureInfo {
VdpPictureInfoH264 h264; ///< H.264 variant
VdpPictureInfoMPEG1Or2 mpeg; ///< MPEG-1 / MPEG-2 variant
VdpPictureInfoVC1 vc1; ///< VC-1 variant
VdpPictureInfoMPEG4Part2 mpeg4; ///< MPEG-4 Part 2 variant
};
/**
* This structure is used to share data between the libavcodec library and
* the client video application.
* The user shall zero-allocate the structure and make it available as
* AVCodecContext.hwaccel_context. Members can be set by the user once
* during initialization or through each AVCodecContext.get_buffer()
* function call. In any case, they must be valid prior to calling
* decoding functions.
*/
typedef struct AVVDPAUContext {
/**
* VDPAU decoder handle
*
* Set by the user.
*/
VdpDecoder decoder;
/**
* VDPAU decoder render callback
*
* Set by the user.
*/
VdpDecoderRender *render;
/**
* VDPAU picture information
*
* Set by libavcodec.
*/
union FFVdpPictureInfo info;
/**
* Allocated size of the bitstream_buffers table.
*
* Set by libavcodec.
*/
int bitstream_buffers_allocated;
/**
* Number of useful bitstream buffers in the bitstream_buffers table.
*
* Set by libavcodec.
*/
int bitstream_buffers_used;
/**
* Table of bitstream buffers.
* The user is responsible for freeing this buffer using av_freep().
*
* Set by libavcodec.
*/
VdpBitstreamBuffer *bitstream_buffers;
} AVVDPAUContext;
/** @brief The videoSurface is used for rendering. */
#define FF_VDPAU_STATE_USED_FOR_RENDER 1
/**
* @brief The videoSurface is needed for reference/prediction.
* The codec manipulates this.
*/
#define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
/**
* @brief This structure is used as a callback between the FFmpeg
* decoder (vd_) and presentation (vo_) module.
* This is used for defining a video frame containing surface,
* picture parameter, bitstream information etc which are passed
* between the FFmpeg decoder and its clients.
*/
struct vdpau_render_state {
VdpVideoSurface surface; ///< Used as rendered surface, never changed.
int state; ///< Holds FF_VDPAU_STATE_* values.
/* The "info" member is declared exactly once. With
AV_HAVE_INCOMPATIBLE_LIBAV_ABI set it sits here, before the bitstream
fields; otherwise it is the last member (see the #if ! block below).
Only its position in the struct layout differs between the two builds. */
#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
/** picture parameter information for all supported codecs */
union FFVdpPictureInfo info;
#endif
/** Describe size/location of the compressed video data.
Set to 0 when freeing bitstream_buffers. */
int bitstream_buffers_allocated;
int bitstream_buffers_used;
/** The user is responsible for freeing this buffer using av_freep(). */
VdpBitstreamBuffer *bitstream_buffers;
/* Placement of "info" when AV_HAVE_INCOMPATIBLE_LIBAV_ABI is not set. */
#if !AV_HAVE_INCOMPATIBLE_LIBAV_ABI
/** picture parameter information for all supported codecs */
union FFVdpPictureInfo info;
#endif
};
/* @}*/
#endif /* AVCODEC_VDPAU_H */

View File

@ -1,95 +0,0 @@
/*
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_VERSION_H
#define AVCODEC_VERSION_H
/**
* @file
* @ingroup libavc
* Libavcodec version macros.
*/
#include "libavutil/avutil.h"
#define LIBAVCODEC_VERSION_MAJOR 55
#define LIBAVCODEC_VERSION_MINOR 18
#define LIBAVCODEC_VERSION_MICRO 102
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
LIBAVCODEC_VERSION_MINOR, \
LIBAVCODEC_VERSION_MICRO)
#define LIBAVCODEC_VERSION AV_VERSION(LIBAVCODEC_VERSION_MAJOR, \
LIBAVCODEC_VERSION_MINOR, \
LIBAVCODEC_VERSION_MICRO)
#define LIBAVCODEC_BUILD LIBAVCODEC_VERSION_INT
#define LIBAVCODEC_IDENT "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#ifndef FF_API_REQUEST_CHANNELS
#define FF_API_REQUEST_CHANNELS (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_ALLOC_CONTEXT
#define FF_API_ALLOC_CONTEXT (LIBAVCODEC_VERSION_MAJOR < 55)
#endif
#ifndef FF_API_AVCODEC_OPEN
#define FF_API_AVCODEC_OPEN (LIBAVCODEC_VERSION_MAJOR < 55)
#endif
#ifndef FF_API_OLD_DECODE_AUDIO
#define FF_API_OLD_DECODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_OLD_TIMECODE
#define FF_API_OLD_TIMECODE (LIBAVCODEC_VERSION_MAJOR < 55)
#endif
#ifndef FF_API_OLD_ENCODE_AUDIO
#define FF_API_OLD_ENCODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_OLD_ENCODE_VIDEO
#define FF_API_OLD_ENCODE_VIDEO (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_CODEC_ID
#define FF_API_CODEC_ID (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_AVCODEC_RESAMPLE
#define FF_API_AVCODEC_RESAMPLE (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_DEINTERLACE
#define FF_API_DEINTERLACE (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_DESTRUCT_PACKET
#define FF_API_DESTRUCT_PACKET (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_GET_BUFFER
#define FF_API_GET_BUFFER (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_MISSING_SAMPLE
#define FF_API_MISSING_SAMPLE (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_LOWRES
#define FF_API_LOWRES (LIBAVCODEC_VERSION_MAJOR < 56)
#endif
#endif /* AVCODEC_VERSION_H */

View File

@ -1,168 +0,0 @@
/*
* Copyright (C) 2003 Ivan Kalvachev
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_XVMC_H
#define AVCODEC_XVMC_H
/**
* @file
* @ingroup lavc_codec_hwaccel_xvmc
* Public libavcodec XvMC header.
*/
#include <X11/extensions/XvMC.h>
#include "avcodec.h"
/**
* @defgroup lavc_codec_hwaccel_xvmc XvMC
* @ingroup lavc_codec_hwaccel
*
* @{
*/
#define AV_XVMC_ID 0x1DC711C0 /**< Special value used to check that regular pixel routines haven't corrupted the struct.
The number is leet ("1337") speak for the letters IDCT MCo (motion compensation). */
struct xvmc_pix_fmt {
/** The field contains the special constant value AV_XVMC_ID.
It is used as a test that the application correctly uses the API,
and that there is no corruption caused by pixel routines.
- application - set during initialization
- libavcodec - unchanged
*/
int xvmc_id;
/** Pointer to the block array allocated by XvMCCreateBlocks().
The array has to be freed by XvMCDestroyBlocks().
Each group of 64 values represents one data block of differential
pixel information (in MoCo mode) or coefficients for IDCT.
- application - set the pointer during initialization
- libavcodec - fills coefficients/pixel data into the array
*/
short* data_blocks;
/** Pointer to the macroblock description array allocated by
XvMCCreateMacroBlocks() and freed by XvMCDestroyMacroBlocks().
- application - set the pointer during initialization
- libavcodec - fills description data into the array
*/
XvMCMacroBlock* mv_blocks;
/** Number of macroblock descriptions that can be stored in the mv_blocks
array.
- application - set during initialization
- libavcodec - unchanged
*/
int allocated_mv_blocks;
/** Number of blocks that can be stored at once in the data_blocks array.
- application - set during initialization
- libavcodec - unchanged
*/
int allocated_data_blocks;
/** Indicate that the hardware would interpret data_blocks as IDCT
coefficients and perform IDCT on them.
- application - set during initialization
- libavcodec - unchanged
*/
int idct;
/** In MoCo mode it indicates that intra macroblocks are assumed to be in
unsigned format; same as the XVMC_INTRA_UNSIGNED flag.
- application - set during initialization
- libavcodec - unchanged
*/
int unsigned_intra;
/** Pointer to the surface allocated by XvMCCreateSurface().
It has to be freed by XvMCDestroySurface() on application exit.
It identifies the frame and its state on the video hardware.
- application - set during initialization
- libavcodec - unchanged
*/
XvMCSurface* p_surface;
/** Set by the decoder before calling ff_draw_horiz_band(),
needed by the XvMCRenderSurface function. */
//@{
/** Pointer to the surface used as past reference
- application - unchanged
- libavcodec - set
*/
XvMCSurface* p_past_surface;
/** Pointer to the surface used as future reference
- application - unchanged
- libavcodec - set
*/
XvMCSurface* p_future_surface;
/** top/bottom field or frame
- application - unchanged
- libavcodec - set
*/
unsigned int picture_structure;
/** XVMC_SECOND_FIELD - 1st or 2nd field in the sequence
- application - unchanged
- libavcodec - set
*/
unsigned int flags;
//@}
/** Number of macroblock descriptions in the mv_blocks array
that have already been passed to the hardware.
- application - zeroes it on get_buffer().
A successful ff_draw_horiz_band() may increment it
by filled_mv_blocks_num or zero both.
- libavcodec - unchanged
*/
int start_mv_blocks_num;
/** Number of new macroblock descriptions in the mv_blocks array (after
start_mv_blocks_num) that are filled by libavcodec and have to be
passed to the hardware.
- application - zeroes it on get_buffer() or after successful
ff_draw_horiz_band().
- libavcodec - incremented by one for each stored MB
*/
int filled_mv_blocks_num;
/** Number of the next free data block; one data block consists of
64 short values in the data_blocks array.
All blocks before this one have already been claimed by placing their
position into the corresponding block description structure field,
that are part of the mv_blocks array.
- application - zeroes it on get_buffer().
A successful ff_draw_horiz_band() may zero it together
with start_mv_blocks_num.
- libavcodec - each decoded macroblock increases it by the number
of coded blocks it contains.
*/
int next_free_data_block_num;
};
/**
* @}
*/
#endif /* AVCODEC_XVMC_H */

View File

@ -1,69 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVDEVICE_AVDEVICE_H
#define AVDEVICE_AVDEVICE_H
#include "version.h"
/**
* @file
* @ingroup lavd
* Main libavdevice API header
*/
/**
* @defgroup lavd Special devices muxing/demuxing library
* @{
* Libavdevice is a complementary library to @ref libavf "libavformat". It
* provides various "special" platform-specific muxers and demuxers, e.g. for
* grabbing devices, audio capture and playback etc. As a consequence, the
* (de)muxers in libavdevice are of the AVFMT_NOFILE type (they use their own
* I/O functions). The filename passed to avformat_open_input() often does not
* refer to an actually existing file, but has some special device-specific
* meaning - e.g. for x11grab it is the display name.
*
* To use libavdevice, simply call avdevice_register_all() to register all
* compiled muxers and demuxers. They all use standard libavformat API.
* @}
*/
#include "libavformat/avformat.h"
/**
* Get the libavdevice version.
* @return the LIBAVDEVICE_VERSION_INT constant.
*/
unsigned avdevice_version(void);
/**
* Get the libavdevice build-time configuration.
* @return the build-time configuration string.
*/
const char *avdevice_configuration(void);
/**
* Get the libavdevice license.
* @return the license string.
*/
const char *avdevice_license(void);
/**
* Initialize libavdevice and register all the input and output devices.
* @warning This function is not thread safe.
*/
void avdevice_register_all(void);
#endif /* AVDEVICE_AVDEVICE_H */

View File

@ -1,50 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVDEVICE_VERSION_H
#define AVDEVICE_VERSION_H
/**
* @file
* @ingroup lavd
* Libavdevice version macros
*/
#include "libavutil/avutil.h"
#define LIBAVDEVICE_VERSION_MAJOR 55
#define LIBAVDEVICE_VERSION_MINOR 3
#define LIBAVDEVICE_VERSION_MICRO 100
#define LIBAVDEVICE_VERSION_INT AV_VERSION_INT(LIBAVDEVICE_VERSION_MAJOR, \
LIBAVDEVICE_VERSION_MINOR, \
LIBAVDEVICE_VERSION_MICRO)
#define LIBAVDEVICE_VERSION AV_VERSION(LIBAVDEVICE_VERSION_MAJOR, \
LIBAVDEVICE_VERSION_MINOR, \
LIBAVDEVICE_VERSION_MICRO)
#define LIBAVDEVICE_BUILD LIBAVDEVICE_VERSION_INT
#define LIBAVDEVICE_IDENT "Lavd" AV_STRINGIFY(LIBAVDEVICE_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#endif /* AVDEVICE_VERSION_H */

File diff suppressed because it is too large Load Diff

View File

@ -1,481 +0,0 @@
/*
* copyright (c) 2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFORMAT_AVIO_H
#define AVFORMAT_AVIO_H
/**
* @file
* @ingroup lavf_io
* Buffered I/O operations
*/
#include <stdint.h>
#include "libavutil/common.h"
#include "libavutil/dict.h"
#include "libavutil/log.h"
#include "libavformat/version.h"
#define AVIO_SEEKABLE_NORMAL 0x0001 /**< Seeking works like for a local file */
/**
* Callback for checking whether to abort blocking functions.
* AVERROR_EXIT is returned in this case by the interrupted
* function. During blocking operations, callback is called with
* opaque as parameter. If the callback returns 1, the
* blocking operation will be aborted.
*
* No members can be added to this struct without a major bump, if
* new elements have been added after this struct in AVFormatContext
* or AVIOContext.
*/
typedef struct AVIOInterruptCB {
int (*callback)(void*); /**< Called with opaque as its parameter during blocking operations; returning 1 aborts the operation. */
void *opaque; /**< Value handed to callback. */
} AVIOInterruptCB;
/**
* Bytestream IO Context.
* New fields can be added to the end with minor version bumps.
* Removal, reordering and changes to existing fields require a major
* version bump.
* sizeof(AVIOContext) must not be used outside libav*.
*
* @note None of the function pointers in AVIOContext should be called
* directly, they should only be set by the client application
* when implementing custom I/O. Normally these are set to the
* function pointers specified in avio_alloc_context()
*/
typedef struct AVIOContext {
/**
* A class for private options.
*
* If this AVIOContext is created by avio_open2(), av_class is set and
* passes the options down to protocols.
*
* If this AVIOContext is manually allocated, then av_class may be set by
* the caller.
*
* warning -- this field can be NULL, be sure to not pass this AVIOContext
* to any av_opt_* functions in that case.
*/
const AVClass *av_class;
unsigned char *buffer; /**< Start of the buffer. */
int buffer_size; /**< Maximum buffer size */
unsigned char *buf_ptr; /**< Current position in the buffer */
unsigned char *buf_end; /**< End of the data, may be less than
buffer+buffer_size if the read function returned
less data than requested, e.g. for streams where
no more data has been received yet. */
void *opaque; /**< A private pointer, passed to the read/write/seek/...
functions. */
/* I/O callbacks. Each one receives the opaque pointer above as its first
argument; they share the signatures of the read_packet, write_packet
and seek parameters of avio_alloc_context(). */
int (*read_packet)(void *opaque, uint8_t *buf, int buf_size);
int (*write_packet)(void *opaque, uint8_t *buf, int buf_size);
int64_t (*seek)(void *opaque, int64_t offset, int whence);
int64_t pos; /**< position in the file of the current buffer */
int must_flush; /**< true if the next seek should flush */
int eof_reached; /**< true if eof reached */
int write_flag; /**< true if open for writing */
/* NOTE(review): max_packet_size and the three checksum members below are
not documented in this header; take their semantics from the
implementation before relying on them. */
int max_packet_size;
unsigned long checksum;
unsigned char *checksum_ptr;
unsigned long (*update_checksum)(unsigned long checksum, const uint8_t *buf, unsigned int size);
int error; /**< contains the error code or 0 if no error happened */
/**
* Pause or resume playback for network streaming protocols - e.g. MMS.
*/
int (*read_pause)(void *opaque, int pause);
/**
* Seek to a given timestamp in stream with the specified stream_index.
* Needed for some network streaming protocols which don't support seeking
* to byte position.
*/
int64_t (*read_seek)(void *opaque, int stream_index,
int64_t timestamp, int flags);
/**
* A combination of AVIO_SEEKABLE_ flags or 0 when the stream is not seekable.
*/
int seekable;
/**
* max filesize, used to limit allocations
* This field is internal to libavformat and access from outside is not allowed.
*/
int64_t maxsize;
/**
* avio_read and avio_write should if possible be satisfied directly
* instead of going through a buffer, and avio_seek will always
* call the underlying seek function directly.
*/
int direct;
/**
* Bytes read statistic
* This field is internal to libavformat and access from outside is not allowed.
*/
int64_t bytes_read;
/**
* seek statistic
* This field is internal to libavformat and access from outside is not allowed.
*/
int seek_count;
/**
* writeout statistic
* This field is internal to libavformat and access from outside is not allowed.
*/
int writeout_count;
} AVIOContext;
/* unbuffered I/O */
/**
* Return AVIO_FLAG_* access flags corresponding to the access permissions
* of the resource in url, or a negative value corresponding to an
* AVERROR code in case of failure. The returned access flags are
* masked by the value in flags.
*
* @note This function is intrinsically unsafe, in the sense that the
* checked resource may change its existence or permission status from
* one call to another. Thus you should not trust the returned value,
* unless you are sure that no other processes are accessing the
* checked resource.
*/
int avio_check(const char *url, int flags);
/**
* Allocate and initialize an AVIOContext for buffered I/O. It must be later
* freed with av_free().
*
* @param buffer Memory block for input/output operations via AVIOContext.
* The buffer must be allocated with av_malloc() and friends.
* @param buffer_size The buffer size is very important for performance.
* For protocols with a fixed block size it should be set to this block size.
* For others a typical size is a cache page, e.g. 4kb.
* @param write_flag Set to 1 if the buffer should be writable, 0 otherwise.
* @param opaque An opaque pointer to user-specific data.
* @param read_packet A function for refilling the buffer, may be NULL.
* @param write_packet A function for writing the buffer contents, may be NULL.
* The function may not change the input buffer's content.
* @param seek A function for seeking to a specified byte position, may be NULL.
*
* @return Allocated AVIOContext or NULL on failure.
*/
AVIOContext *avio_alloc_context(
unsigned char *buffer,
int buffer_size,
int write_flag,
void *opaque,
int (*read_packet)(void *opaque, uint8_t *buf, int buf_size),
int (*write_packet)(void *opaque, uint8_t *buf, int buf_size),
int64_t (*seek)(void *opaque, int64_t offset, int whence));
/*
* Write helpers for an AVIOContext: avio_w8() takes a single value b,
* avio_write() takes a buffer buf together with its size, and the
* avio_w{l,b}{16,24,32,64}() variants take one integer value.
* NOTE(review): by analogy with avio_put_str16le() (documented as writing
* UTF-16LE), the l/b infix presumably selects little-/big-endian byte order
* and the numeric suffix the number of bits written - confirm against the
* implementation.
*/
void avio_w8(AVIOContext *s, int b);
void avio_write(AVIOContext *s, const unsigned char *buf, int size);
void avio_wl64(AVIOContext *s, uint64_t val);
void avio_wb64(AVIOContext *s, uint64_t val);
void avio_wl32(AVIOContext *s, unsigned int val);
void avio_wb32(AVIOContext *s, unsigned int val);
void avio_wl24(AVIOContext *s, unsigned int val);
void avio_wb24(AVIOContext *s, unsigned int val);
void avio_wl16(AVIOContext *s, unsigned int val);
void avio_wb16(AVIOContext *s, unsigned int val);
/**
* Write a NULL-terminated string.
* @param s   AVIOContext to write to
* @param str string to write
* @return number of bytes written.
*/
int avio_put_str(AVIOContext *s, const char *str);
/**
* Convert a UTF-8 string to UTF-16LE and write it.
* @param s   AVIOContext to write to
* @param str UTF-8 string to convert and write
* @return number of bytes written.
*/
int avio_put_str16le(AVIOContext *s, const char *str);
/**
* Passing this as the "whence" parameter to a seek function causes it to
* return the filesize without seeking anywhere. Supporting this is optional.
* If it is not supported then the seek function will return <0.
*/
#define AVSEEK_SIZE 0x10000
/**
* ORing this flag into the "whence" parameter of a seek function causes it to
* seek by any means (like reopening and linear reading) or other normally
* unreasonable means that can be extremely slow.
* This may be ignored by the seek code.
*/
#define AVSEEK_FORCE 0x20000
/**
* fseek() equivalent for AVIOContext.
* @return new position or AVERROR.
*/
int64_t avio_seek(AVIOContext *s, int64_t offset, int whence);
/**
* Skip given number of bytes forward
* @return new position or AVERROR.
*/
int64_t avio_skip(AVIOContext *s, int64_t offset);
/**
* Report the current position of an AVIOContext, as ftell() does for a FILE.
* Implemented as a zero-offset avio_seek() relative to SEEK_CUR.
* @return position or AVERROR.
*/
static av_always_inline int64_t avio_tell(AVIOContext *s)
{
    const int64_t current_pos = avio_seek(s, 0, SEEK_CUR);

    return current_pos;
}
/**
* Get the filesize.
* @return filesize or AVERROR
*/
int64_t avio_size(AVIOContext *s);
/**
* feof() equivalent for AVIOContext.
* @return non zero if and only if end of file
*/
int url_feof(AVIOContext *s);
/** @warning currently size is limited */
int avio_printf(AVIOContext *s, const char *fmt, ...) av_printf_format(2, 3);
/**
* Force flushing of buffered data to the output s.
*
* Force the buffered data to be immediately written to the output,
* without waiting for the internal buffer to fill up.
*/
void avio_flush(AVIOContext *s);
/**
* Read size bytes from AVIOContext into buf.
* @return number of bytes read or AVERROR
*/
int avio_read(AVIOContext *s, unsigned char *buf, int size);
/**
* @name Functions for reading from AVIOContext
* @{
*
* @note return 0 if EOF, so you cannot use it if EOF handling is
* necessary
*/
int avio_r8 (AVIOContext *s);
unsigned int avio_rl16(AVIOContext *s);
unsigned int avio_rl24(AVIOContext *s);
unsigned int avio_rl32(AVIOContext *s);
uint64_t avio_rl64(AVIOContext *s);
unsigned int avio_rb16(AVIOContext *s);
unsigned int avio_rb24(AVIOContext *s);
unsigned int avio_rb32(AVIOContext *s);
uint64_t avio_rb64(AVIOContext *s);
/**
* @}
*/
/**
* Read a string from pb into buf. The reading will terminate when either
* a NULL character was encountered, maxlen bytes have been read, or nothing
* more can be read from pb. The result is guaranteed to be NULL-terminated, it
* will be truncated if buf is too small.
* Note that the string is not interpreted or validated in any way, it
* might get truncated in the middle of a sequence for multi-byte encodings.
*
* @return number of bytes read (is always <= maxlen).
* If reading ends on EOF or error, the return value will be one more than
* bytes actually read.
*/
int avio_get_str(AVIOContext *pb, int maxlen, char *buf, int buflen);
/**
* Read a UTF-16 string from pb and convert it to UTF-8.
* The reading will terminate when either a null or invalid character was
* encountered or maxlen bytes have been read.
* @return number of bytes read (is always <= maxlen)
*/
int avio_get_str16le(AVIOContext *pb, int maxlen, char *buf, int buflen);
int avio_get_str16be(AVIOContext *pb, int maxlen, char *buf, int buflen);
/**
* @name URL open modes
* The flags argument to avio_open must be one of the following
* constants, optionally ORed with other flags.
* @{
*/
#define AVIO_FLAG_READ 1 /**< read-only */
#define AVIO_FLAG_WRITE 2 /**< write-only */
#define AVIO_FLAG_READ_WRITE (AVIO_FLAG_READ|AVIO_FLAG_WRITE) /**< read-write pseudo flag */
/**
* @}
*/
/**
* Use non-blocking mode.
* If this flag is set, operations on the context will return
* AVERROR(EAGAIN) if they cannot be performed immediately.
* If this flag is not set, operations on the context will never return
* AVERROR(EAGAIN).
* Note that this flag does not affect the opening/connecting of the
* context. Connecting a protocol will always block if necessary (e.g. on
* network protocols) but never hang (e.g. on busy devices).
* Warning: support for non-blocking protocols is a work in progress; this
* flag may be silently ignored.
*/
#define AVIO_FLAG_NONBLOCK 8
/**
* Use direct mode.
* avio_read and avio_write should if possible be satisfied directly
* instead of going through a buffer, and avio_seek will always
* call the underlying seek function directly.
*/
#define AVIO_FLAG_DIRECT 0x8000
/**
* Create and initialize an AVIOContext for accessing the
* resource indicated by url.
* @note When the resource indicated by url has been opened in
* read+write mode, the AVIOContext can be used only for writing.
*
* @param s Used to return the pointer to the created AVIOContext.
* In case of failure the pointed to value is set to NULL.
* @param url resource to access
* @param flags flags which control how the resource indicated by url
* is to be opened
* @return 0 in case of success, a negative value corresponding to an
* AVERROR code in case of failure
*/
int avio_open(AVIOContext **s, const char *url, int flags);
/**
* Create and initialize an AVIOContext for accessing the
* resource indicated by url.
* @note When the resource indicated by url has been opened in
* read+write mode, the AVIOContext can be used only for writing.
*
* @param s Used to return the pointer to the created AVIOContext.
* In case of failure the pointed to value is set to NULL.
* @param url the resource to access
* @param flags flags which control how the resource indicated by url
* is to be opened (see the AVIO_FLAG_* constants)
* @param int_cb an interrupt callback to be used at the protocols level
* @param options A dictionary filled with protocol-private options. On return
* this parameter will be destroyed and replaced with a dict containing options
* that were not found. May be NULL.
* @return 0 in case of success, a negative value corresponding to an
* AVERROR code in case of failure
*/
int avio_open2(AVIOContext **s, const char *url, int flags,
const AVIOInterruptCB *int_cb, AVDictionary **options);
/**
* Close the resource accessed by the AVIOContext s and free it.
* This function can only be used if s was opened by avio_open().
*
* The internal buffer is automatically flushed before closing the
* resource.
*
* @param s the context to close and free
* @return 0 on success, an AVERROR < 0 on error.
* @see avio_closep
*/
int avio_close(AVIOContext *s);
/**
* Close the resource accessed by the AVIOContext *s, free it
* and set the pointer pointing to it to NULL.
* This function can only be used if *s was opened by avio_open().
*
* The internal buffer is automatically flushed before closing the
* resource.
*
* @param s address of the pointer to the context; the pointer is set to
* NULL by this call
* @return 0 on success, an AVERROR < 0 on error.
* @see avio_close
*/
int avio_closep(AVIOContext **s);
/**
* Open a write-only memory stream.
*
* @param s used to return the new IO context
* @return zero if no error.
*/
int avio_open_dyn_buf(AVIOContext **s);
/**
* Return the written size and a pointer to the buffer. The buffer
* must be freed with av_free().
* Padding of FF_INPUT_BUFFER_PADDING_SIZE is added to the buffer.
*
* @param s IO context
* @param pbuffer used to return the pointer to the byte buffer
* @return the length of the byte buffer
*/
int avio_close_dyn_buf(AVIOContext *s, uint8_t **pbuffer);
/**
* Iterate through names of available protocols.
*
* @param opaque A private pointer representing the current protocol.
* It must be a pointer to NULL on the first iteration and will
* be updated by successive calls to avio_enum_protocols.
* @param output If set to 1, iterate over output protocols,
* otherwise over input protocols.
*
* @return A static string containing the name of the current protocol or NULL
*/
const char *avio_enum_protocols(void **opaque, int output);
/**
* Pause and resume playing - only meaningful if using a network streaming
* protocol (e.g. MMS).
* @param h IO context of the stream to pause or resume
* @param pause 1 for pause, 0 for resume
* @return not specified in this header.
* NOTE(review): presumably >= 0 on success and a negative AVERROR code
* on failure -- confirm against the implementation.
*/
int avio_pause(AVIOContext *h, int pause);
/**
* Seek to a given timestamp relative to some component stream.
* Only meaningful if using a network streaming protocol (e.g. MMS).
* @param h IO context to seek in
* @param stream_index The stream index that the timestamp is relative to.
* If stream_index is (-1) the timestamp should be in AV_TIME_BASE
* units from the beginning of the presentation.
* If a stream_index >= 0 is used and the protocol does not support
* seeking based on component streams, the call will fail.
* @param timestamp timestamp in AVStream.time_base units
* or if there is no stream specified then in AV_TIME_BASE units.
* @param flags Optional combination of AVSEEK_FLAG_BACKWARD, AVSEEK_FLAG_BYTE
* and AVSEEK_FLAG_ANY. The protocol may silently ignore
* AVSEEK_FLAG_BACKWARD and AVSEEK_FLAG_ANY, but AVSEEK_FLAG_BYTE will
* fail if used and not supported.
* @return >= 0 on success
* @see AVInputFormat::read_seek
*/
int64_t avio_seek_time(AVIOContext *h, int stream_index,
int64_t timestamp, int flags);
#endif /* AVFORMAT_AVIO_H */

View File

@ -1,82 +0,0 @@
/*
* Version macros.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFORMAT_VERSION_H
#define AVFORMAT_VERSION_H
/**
* @file
* @ingroup libavf
* Libavformat version macros
*/
#include "libavutil/avutil.h"
#define LIBAVFORMAT_VERSION_MAJOR 55
#define LIBAVFORMAT_VERSION_MINOR 12
#define LIBAVFORMAT_VERSION_MICRO 100
#define LIBAVFORMAT_VERSION_INT AV_VERSION_INT(LIBAVFORMAT_VERSION_MAJOR, \
LIBAVFORMAT_VERSION_MINOR, \
LIBAVFORMAT_VERSION_MICRO)
#define LIBAVFORMAT_VERSION AV_VERSION(LIBAVFORMAT_VERSION_MAJOR, \
LIBAVFORMAT_VERSION_MINOR, \
LIBAVFORMAT_VERSION_MICRO)
#define LIBAVFORMAT_BUILD LIBAVFORMAT_VERSION_INT
#define LIBAVFORMAT_IDENT "Lavf" AV_STRINGIFY(LIBAVFORMAT_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*
* Each define is wrapped in #ifndef, so a value defined before this header
* is included takes precedence. Most of them compare
* LIBAVFORMAT_VERSION_MAJOR against the major version at which the API is
* to be dropped, and therefore evaluate to 0 once that version is reached;
* FF_API_R_FRAME_RATE is unconditionally 1.
*/
#ifndef FF_API_OLD_AVIO
#define FF_API_OLD_AVIO (LIBAVFORMAT_VERSION_MAJOR < 55)
#endif
#ifndef FF_API_PKT_DUMP
#define FF_API_PKT_DUMP (LIBAVFORMAT_VERSION_MAJOR < 54)
#endif
#ifndef FF_API_ALLOC_OUTPUT_CONTEXT
#define FF_API_ALLOC_OUTPUT_CONTEXT (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_FORMAT_PARAMETERS
#define FF_API_FORMAT_PARAMETERS (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_NEW_STREAM
#define FF_API_NEW_STREAM (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_SET_PTS_INFO
#define FF_API_SET_PTS_INFO (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_CLOSE_INPUT_FILE
#define FF_API_CLOSE_INPUT_FILE (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_READ_PACKET
#define FF_API_READ_PACKET (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_ASS_SSA
#define FF_API_ASS_SSA (LIBAVFORMAT_VERSION_MAJOR < 56)
#endif
#ifndef FF_API_R_FRAME_RATE
#define FF_API_R_FRAME_RATE 1
#endif
#endif /* AVFORMAT_VERSION_H */

View File

@ -1,52 +0,0 @@
/*
* copyright (c) 2006 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_ADLER32_H
#define AVUTIL_ADLER32_H
#include <stdint.h>
#include "attributes.h"
/**
* @defgroup lavu_adler32 Adler32
* @ingroup lavu_crypto
* @{
*/
/**
* Calculate the Adler32 checksum of a buffer.
*
* Passing the return value to a subsequent av_adler32_update() call
* allows the checksum of multiple buffers to be calculated as though
* they were concatenated.
*
* The function is declared av_pure.
*
* @param adler initial checksum value
* @param buf pointer to input buffer
* @param len size of input buffer, in bytes
* @return updated checksum
*/
unsigned long av_adler32_update(unsigned long adler, const uint8_t *buf,
unsigned int len) av_pure;
/**
* @}
*/
#endif /* AVUTIL_ADLER32_H */

View File

@ -1,65 +0,0 @@
/*
* copyright (c) 2007 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AES_H
#define AVUTIL_AES_H
#include <stdint.h>
#include "attributes.h"
#include "version.h"
/**
* @defgroup lavu_aes AES
* @ingroup lavu_crypto
* @{
*/
/* NOTE(review): presumably the size in bytes of struct AVAES -- confirm against the implementation. */
extern const int av_aes_size;
/* Opaque AES context; the definition is not part of this header. */
struct AVAES;
/**
* Allocate an AVAES context.
* @return the allocated context.
* NOTE(review): presumably NULL on allocation failure -- confirm.
*/
struct AVAES *av_aes_alloc(void);
/**
* Initialize an AVAES context.
* @param a the context to initialize
* @param key the key to use.
* NOTE(review): presumably key_bits / 8 bytes long -- confirm.
* @param key_bits 128, 192 or 256
* @param decrypt 0 for encryption, 1 for decryption
* @return not specified in this header.
* NOTE(review): presumably 0 on success and a negative value for an
* unsupported key_bits -- confirm against the implementation.
*/
int av_aes_init(struct AVAES *a, const uint8_t *key, int key_bits, int decrypt);
/**
* Encrypt or decrypt a buffer using a previously initialized context.
* @param a the previously initialized context
* @param count number of 16 byte blocks
* @param dst destination array, can be equal to src
* @param src source array, can be equal to dst
* @param iv initialization vector for CBC mode, if NULL then ECB will be used
* @param decrypt 0 for encryption, 1 for decryption
*/
void av_aes_crypt(struct AVAES *a, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
/**
* @}
*/
#endif /* AVUTIL_AES_H */

View File

@ -1,154 +0,0 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* Macro definitions for various function/variable attributes
*/
#ifndef AVUTIL_ATTRIBUTES_H
#define AVUTIL_ATTRIBUTES_H
/*
 * AV_GCC_VERSION_AT_LEAST(x, y): non-zero when the compiler defines
 * __GNUC__ and reports version x.y or newer, 0 otherwise. Usable both in
 * #if directives and in ordinary expressions.
 *
 * The arguments and the && term are parenthesized so that the macro
 * expands as intended for arbitrary constant expressions (not just plain
 * literals) and does not trigger -Wparentheses diagnostics.
 */
#ifdef __GNUC__
#    define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
#else
#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
#endif
/*
* av_always_inline: GCC >= 3.1 gets the always_inline attribute plus inline,
* MSVC gets __forceinline, and every other compiler falls back to plain
* inline. The #ifndef lets an earlier definition take precedence.
*/
#ifndef av_always_inline
#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_always_inline __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
# define av_always_inline __forceinline
#else
# define av_always_inline inline
#endif
#endif
/*
* av_extern_inline: "extern inline" when __ICL is defined and >= 1210 or
* when __GNUC_STDC_INLINE__ is defined, plain "inline" otherwise.
* && binds tighter than ||, so the condition below reads
* (defined(__ICL) && __ICL >= 1210) || defined(__GNUC_STDC_INLINE__).
* The #ifndef lets an earlier definition take precedence.
*/
#ifndef av_extern_inline
#if defined(__ICL) && __ICL >= 1210 || defined(__GNUC_STDC_INLINE__)
# define av_extern_inline extern inline
#else
# define av_extern_inline inline
#endif
#endif
/* av_noinline: the noinline attribute on GCC >= 3.1, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_noinline __attribute__((noinline))
#else
# define av_noinline
#endif
/* av_pure: the pure attribute on GCC >= 3.1, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_pure __attribute__((pure))
#else
# define av_pure
#endif
/*
* av_restrict: defaults to the restrict keyword unless it is already defined.
* NOTE(review): compilers that do not accept a bare "restrict" (e.g. C++ or
* pre-C99 modes) presumably need av_restrict defined before this header is
* included -- confirm how the build configuration handles this.
*/
#ifndef av_restrict
#define av_restrict restrict
#endif
/* av_const: the const attribute on GCC >= 2.6, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(2,6)
# define av_const __attribute__((const))
#else
# define av_const
#endif
/* av_cold: the cold attribute on GCC >= 4.3, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(4,3)
# define av_cold __attribute__((cold))
#else
# define av_cold
#endif
/* av_flatten: the flatten attribute on GCC >= 4.1, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(4,1)
# define av_flatten __attribute__((flatten))
#else
# define av_flatten
#endif
/* attribute_deprecated: the deprecated attribute on GCC >= 3.1, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(3,1)
# define attribute_deprecated __attribute__((deprecated))
#else
# define attribute_deprecated
#endif
/**
* Disable warnings about deprecated features.
* This is useful for sections of code kept for backward compatibility and
* scheduled for removal.
*
* On GCC >= 4.6 the given code is wrapped in a diagnostic push / pop pair
* with -Wdeprecated-declarations ignored in between; on every other
* compiler the code is emitted unchanged. The #ifndef lets an earlier
* definition take precedence.
*/
#ifndef AV_NOWARN_DEPRECATED
#if AV_GCC_VERSION_AT_LEAST(4,6)
# define AV_NOWARN_DEPRECATED(code) \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") \
code \
_Pragma("GCC diagnostic pop")
#else
# define AV_NOWARN_DEPRECATED(code) code
#endif
#endif
/* av_unused: the unused attribute whenever __GNUC__ is defined (no version check), empty elsewhere. */
#if defined(__GNUC__)
# define av_unused __attribute__((unused))
#else
# define av_unused
#endif
/**
* Mark a variable as used and prevent the compiler from optimizing it
* away. This is useful for variables accessed only from inline
* assembler without the compiler being aware.
* Expands to the used attribute on GCC >= 3.1 and to nothing elsewhere.
*/
#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_used __attribute__((used))
#else
# define av_used
#endif
/* av_alias: the may_alias attribute on GCC >= 3.3, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(3,3)
# define av_alias __attribute__((may_alias))
#else
# define av_alias
#endif
/*
* av_uninit(x): expands to the self-assignment x=x when __GNUC__ is defined
* and neither __INTEL_COMPILER nor __clang__ is; expands to just x otherwise.
* NOTE(review): the self-assignment is presumably meant to silence
* "may be used uninitialized" warnings on GCC -- confirm.
*/
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
# define av_uninit(x) x=x
#else
# define av_uninit(x) x
#endif
/*
* av_builtin_constant_p: __builtin_constant_p when __GNUC__ is defined,
* otherwise a function-like macro that always yields 0.
* av_printf_format(fmtpos, attrpos): the printf format attribute with the
* given format-string and first-argument positions when __GNUC__ is
* defined, empty otherwise.
*/
#ifdef __GNUC__
# define av_builtin_constant_p __builtin_constant_p
# define av_printf_format(fmtpos, attrpos) __attribute__((__format__(__printf__, fmtpos, attrpos)))
#else
# define av_builtin_constant_p(x) 0
# define av_printf_format(fmtpos, attrpos)
#endif
/* av_noreturn: the noreturn attribute on GCC >= 2.5, empty elsewhere. */
#if AV_GCC_VERSION_AT_LEAST(2,5)
# define av_noreturn __attribute__((noreturn))
#else
# define av_noreturn
#endif
#endif /* AVUTIL_ATTRIBUTES_H */

View File

@ -1,149 +0,0 @@
/*
* Audio FIFO
* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* Audio FIFO Buffer
*/
#ifndef AVUTIL_AUDIO_FIFO_H
#define AVUTIL_AUDIO_FIFO_H
#include "avutil.h"
#include "fifo.h"
#include "samplefmt.h"
/**
* @addtogroup lavu_audio
* @{
*/
/**
* Context for an Audio FIFO Buffer.
*
* - Operates at the sample level rather than the byte level.
* - Supports multiple channels with either planar or packed sample format.
* - Automatic reallocation when writing to a full buffer.
*/
typedef struct AVAudioFifo AVAudioFifo;
/**
* Free an AVAudioFifo.
*
* @param af AVAudioFifo to free
*/
void av_audio_fifo_free(AVAudioFifo *af);
/**
* Allocate an AVAudioFifo.
*
* @param sample_fmt sample format
* @param channels number of channels
* @param nb_samples initial allocation size, in samples
* @return newly allocated AVAudioFifo, or NULL on error
*/
AVAudioFifo *av_audio_fifo_alloc(enum AVSampleFormat sample_fmt, int channels,
int nb_samples);
/**
* Reallocate an AVAudioFifo.
*
* @param af AVAudioFifo to reallocate
* @param nb_samples new allocation size, in samples
* @return 0 if OK, or negative AVERROR code on failure
*/
int av_audio_fifo_realloc(AVAudioFifo *af, int nb_samples);
/**
* Write data to an AVAudioFifo.
*
* The AVAudioFifo will be reallocated automatically if the available space
* is less than nb_samples.
*
* @see enum AVSampleFormat
* The documentation for AVSampleFormat describes the data layout.
*
* @param af AVAudioFifo to write to
* @param data audio data plane pointers
* @param nb_samples number of samples to write
* @return number of samples actually written, or negative AVERROR
* code on failure. If successful, the number of samples
* actually written will always be nb_samples.
*/
int av_audio_fifo_write(AVAudioFifo *af, void **data, int nb_samples);
/**
* Read data from an AVAudioFifo.
*
* @see enum AVSampleFormat
* The documentation for AVSampleFormat describes the data layout.
*
* @param af AVAudioFifo to read from
* @param data audio data plane pointers
* @param nb_samples number of samples to read
* @return number of samples actually read, or negative AVERROR code
* on failure. The number of samples actually read will not
* be greater than nb_samples, and will only be less than
* nb_samples if av_audio_fifo_size is less than nb_samples.
*/
int av_audio_fifo_read(AVAudioFifo *af, void **data, int nb_samples);
/**
* Drain data from an AVAudioFifo.
*
* Removes the data without reading it.
*
* @param af AVAudioFifo to drain
* @param nb_samples number of samples to drain
* @return 0 if OK, or negative AVERROR code on failure
*/
int av_audio_fifo_drain(AVAudioFifo *af, int nb_samples);
/**
* Reset the AVAudioFifo buffer.
*
* This empties all data in the buffer.
*
* @param af AVAudioFifo to reset
*/
void av_audio_fifo_reset(AVAudioFifo *af);
/**
* Get the current number of samples in the AVAudioFifo available for reading.
*
* @param af the AVAudioFifo to query
* @return number of samples available for reading
*/
int av_audio_fifo_size(AVAudioFifo *af);
/**
* Get the current number of samples in the AVAudioFifo available for writing.
*
* @param af the AVAudioFifo to query
* @return number of samples available for writing
*/
int av_audio_fifo_space(AVAudioFifo *af);
/**
* @}
*/
#endif /* AVUTIL_AUDIO_FIFO_H */

View File

@ -1,6 +0,0 @@
#include "version.h"
#if FF_API_AUDIOCONVERT
#include "channel_layout.h"
#endif

View File

@ -1,66 +0,0 @@
/*
* copyright (c) 2010 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* simple assert() macros that are a bit more flexible than ISO C assert().
* @author Michael Niedermayer <michaelni@gmx.at>
*/
#ifndef AVUTIL_AVASSERT_H
#define AVUTIL_AVASSERT_H
#include <stdlib.h>
#include "avutil.h"
#include "log.h"
/**
* assert() equivalent that is always enabled.
*
* If cond evaluates to false, the stringified condition is logged together
* with the file name and line number at AV_LOG_PANIC level, and abort() is
* called. The do { } while (0) wrapper lets the macro be used as a single
* statement.
*/
#define av_assert0(cond) do { \
if (!(cond)) { \
av_log(NULL, AV_LOG_PANIC, "Assertion %s failed at %s:%d\n", \
AV_STRINGIFY(cond), __FILE__, __LINE__); \
abort(); \
} \
} while (0)
/**
* assert() equivalent for code that is not speed critical.
* These asserts can therefore be enabled without fear of a slowdown.
*
* Active (same as av_assert0) only when ASSERT_LEVEL is defined and
* greater than 0; otherwise the macro expands to ((void)0) and cond is
* not evaluated.
*/
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 0
#define av_assert1(cond) av_assert0(cond)
#else
#define av_assert1(cond) ((void)0)
#endif
/**
* assert() equivalent for speed critical code.
*
* Active (same as av_assert0) only when ASSERT_LEVEL is defined and
* greater than 1; otherwise the macro expands to ((void)0) and cond is
* not evaluated.
*/
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
#define av_assert2(cond) av_assert0(cond)
#else
#define av_assert2(cond) ((void)0)
#endif
#endif /* AVUTIL_AVASSERT_H */

View File

@ -1,8 +0,0 @@
/* Generated by ffconf */
#ifndef AVUTIL_AVCONFIG_H
#define AVUTIL_AVCONFIG_H
#define AV_HAVE_BIGENDIAN 0
#define AV_HAVE_FAST_UNALIGNED 1
#define AV_HAVE_INCOMPATIBLE_LIBAV_ABI 0
#define AV_HAVE_INCOMPATIBLE_FORK_ABI 0
#endif /* AVUTIL_AVCONFIG_H */

View File

@ -1,302 +0,0 @@
/*
* Copyright (c) 2007 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AVSTRING_H
#define AVUTIL_AVSTRING_H
#include <stddef.h>
#include "attributes.h"
/**
* @addtogroup lavu_string
* @{
*/
/**
* Return non-zero if pfx is a prefix of str. If it is, *ptr is set to
* the address of the first character in str after the prefix.
*
* @param str input string
* @param pfx prefix to test
* @param ptr updated if the prefix is matched inside str
* @return non-zero if the prefix matches, zero otherwise
*/
int av_strstart(const char *str, const char *pfx, const char **ptr);
/**
* Return non-zero if pfx is a prefix of str independent of case. If
* it is, *ptr is set to the address of the first character in str
* after the prefix.
*
* @param str input string
* @param pfx prefix to test
* @param ptr updated if the prefix is matched inside str
* @return non-zero if the prefix matches, zero otherwise
*/
int av_stristart(const char *str, const char *pfx, const char **ptr);
/**
* Locate the first case-independent occurrence in the string haystack
* of the string needle. A zero-length string needle is considered to
* match at the start of haystack.
*
* This function is a case-insensitive version of the standard strstr().
*
* @param haystack string to search in
* @param needle string to search for
* @return pointer to the located match within haystack
* or a null pointer if no match
*/
char *av_stristr(const char *haystack, const char *needle);
/**
* Locate the first occurrence of the string needle in the string haystack
* where not more than hay_length characters are searched. A zero-length
* string needle is considered to match at the start of haystack.
*
* This function is a length-limited version of the standard strstr().
*
* @param haystack string to search in
* @param needle string to search for
* @param hay_length length of string to search in
* @return pointer to the located match within haystack
* or a null pointer if no match
*/
char *av_strnstr(const char *haystack, const char *needle, size_t hay_length);
/**
* Copy the string src to dst, but no more than size - 1 bytes, and
* null-terminate dst.
*
* This function is the same as BSD strlcpy().
*
* @param dst destination buffer
* @param src source string
* @param size size of destination buffer
* @return the length of src
*
* @warning since the return value is the length of src, src absolutely
* _must_ be a properly 0-terminated string, otherwise this will read beyond
* the end of the buffer and possibly crash.
*/
size_t av_strlcpy(char *dst, const char *src, size_t size);
/**
* Append the string src to the string dst, but to a total length of
* no more than size - 1 bytes, and null-terminate dst.
*
* This function is similar to BSD strlcat(), but differs when
* size <= strlen(dst).
*
* @param dst destination buffer
* @param src source string
* @param size size of destination buffer
* @return the total length of src and dst
*
* @warning since the return value uses the lengths of src and dst, both
* absolutely _must_ be properly 0-terminated strings, otherwise this
* will read beyond the end of the buffer and possibly crash.
*/
size_t av_strlcat(char *dst, const char *src, size_t size);
/**
* Append output to a string, according to a format. Never write out of
* the destination buffer, and always put a terminating 0 within
* the buffer.
* @param dst destination buffer (string to which the output is
* appended)
* @param size total size of the destination buffer
* @param fmt printf-compatible format string, specifying how the
* following parameters are used
* @return the length of the string that would have been generated
* if enough space had been available
*/
size_t av_strlcatf(char *dst, size_t size, const char *fmt, ...) av_printf_format(3, 4);
/**
* Print arguments following specified format into a large enough auto
* allocated buffer. It is similar to GNU asprintf().
* @param fmt printf-compatible format string, specifying how the
* following parameters are used.
* @return the allocated string
* @note You have to free the string yourself with av_free().
*/
char *av_asprintf(const char *fmt, ...) av_printf_format(1, 2);
/**
* Convert a number to an av_malloc()ed string.
* @param d the number to convert
* @return the allocated string.
* NOTE(review): presumably to be released with av_free() and NULL on
* allocation failure -- confirm against the implementation.
*/
char *av_d2str(double d);
/**
* Unescape the given string until a non escaped terminating char,
* and return the token corresponding to the unescaped string.
*
* The normal \ and ' escaping is supported. Leading and trailing
* whitespaces are removed, unless they are escaped with '\' or are
* enclosed between ''.
*
* @param buf the buffer to parse, buf will be updated to point to the
* terminating char
* @param term a 0-terminated list of terminating chars
* @return the malloced unescaped string, which must be av_freed by
* the user, NULL in case of allocation failure
*/
char *av_get_token(const char **buf, const char *term);
/**
* Split the string into several tokens which can be accessed by
* successive calls to av_strtok().
*
* A token is defined as a sequence of characters not belonging to the
* set specified in delim.
*
* On the first call to av_strtok(), s should point to the string to
* parse, and the value of saveptr is ignored. In subsequent calls, s
* should be NULL, and saveptr should be unchanged since the previous
* call.
*
* This function is similar to strtok_r() defined in POSIX.1.
*
* @param s the string to parse, may be NULL
* @param delim 0-terminated list of token delimiters, must be non-NULL
* @param saveptr user-provided pointer which points to stored
* information necessary for av_strtok() to continue scanning the same
* string. saveptr is updated to point to the next character after the
* first delimiter found, or to NULL if the string was terminated
* @return the found token, or NULL when no token is found
*/
char *av_strtok(char *s, const char *delim, char **saveptr);
/**
* Locale-independent, ASCII-only equivalent of isdigit().
*/
int av_isdigit(int c);
/**
* Locale-independent, ASCII-only equivalent of isgraph().
*/
int av_isgraph(int c);
/**
* Locale-independent, ASCII-only equivalent of isspace().
*/
int av_isspace(int c);
/**
 * Map an ASCII lowercase letter to its uppercase counterpart without
 * consulting the current locale.
 *
 * @param c character value to convert
 * @return the uppercase letter if c is in 'a'..'z', otherwise c unchanged
 */
static inline int av_toupper(int c)
{
    const int is_lower = (c >= 'a' && c <= 'z');

    /* Flipping bit 0x20 turns an ASCII lowercase letter into uppercase. */
    return is_lower ? (c ^ 0x20) : c;
}
/**
 * Map an ASCII uppercase letter to its lowercase counterpart without
 * consulting the current locale.
 *
 * @param c character value to convert
 * @return the lowercase letter if c is in 'A'..'Z', otherwise c unchanged
 */
static inline int av_tolower(int c)
{
    const int is_upper = (c >= 'A' && c <= 'Z');

    /* Flipping bit 0x20 turns an ASCII uppercase letter into lowercase. */
    return is_upper ? (c ^ 0x20) : c;
}
/**
* Locale-independent, ASCII-only equivalent of isxdigit().
*/
int av_isxdigit(int c);
/**
* Locale-independent case-insensitive compare.
* @note This means only ASCII-range characters are case-insensitive
*/
int av_strcasecmp(const char *a, const char *b);
/**
* Locale-independent case-insensitive compare.
* @note This means only ASCII-range characters are case-insensitive
*/
int av_strncasecmp(const char *a, const char *b, size_t n);
/**
* Thread safe basename.
* @param path the path, on DOS both \ and / are considered separators.
* @return pointer to the basename substring.
*/
const char *av_basename(const char *path);
/**
* Thread safe dirname.
* @param path the path, on DOS both \ and / are considered separators.
* @return the path with the separator replaced by the string terminator or ".".
* @note the function may change the input string.
*/
const char *av_dirname(char *path);
/**
* Escaping modes accepted by av_escape().
*/
enum AVEscapeMode {
AV_ESCAPE_MODE_AUTO, ///< Use auto-selected escaping mode.
AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
AV_ESCAPE_MODE_QUOTE, ///< Use single-quote escaping.
};
/**
* Consider spaces special and escape them even in the middle of the
* string.
*
* This is equivalent to adding the whitespace characters to the special
* characters lists, except it is guaranteed to use the exact same list
* of whitespace characters as the rest of libavutil.
*/
#define AV_ESCAPE_FLAG_WHITESPACE 0x01
/**
* Escape only specified special characters.
* Without this flag, escape also any characters that may be considered
* special by av_get_token(), such as the single quote.
*/
#define AV_ESCAPE_FLAG_STRICT 0x02
/**
* Escape string in src, and put the escaped string in an allocated
* string in *dst, which must be freed with av_free().
*
* @param dst pointer where an allocated string is put
* @param src string to escape, must be non-NULL
* @param special_chars string containing the special characters which
* need to be escaped, can be NULL
* @param mode escape mode to employ, see the enum AVEscapeMode values.
* Any unknown value for mode will be considered equivalent to
* AV_ESCAPE_MODE_BACKSLASH, but this behaviour can change without
* notice.
* @param flags flags which control how to escape, see AV_ESCAPE_FLAG_* macros
* @return the length of the allocated string, or a negative error code in case of error
* @see av_bprint_escape()
*/
int av_escape(char **dst, const char *src, const char *special_chars,
enum AVEscapeMode mode, int flags);
/**
* @}
*/
#endif /* AVUTIL_AVSTRING_H */

View File

@ -1,314 +0,0 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AVUTIL_H
#define AVUTIL_AVUTIL_H
/**
* @file
* external API header
*/
/**
* @mainpage
*
* @section ffmpeg_intro Introduction
*
* This document describes the usage of the different libraries
* provided by FFmpeg.
*
* @li @ref libavc "libavcodec" encoding/decoding library
* @li @ref lavfi "libavfilter" graph-based frame editing library
* @li @ref libavf "libavformat" I/O and muxing/demuxing library
* @li @ref lavd "libavdevice" special devices muxing/demuxing library
* @li @ref lavu "libavutil" common utility library
* @li @ref lswr "libswresample" audio resampling, format conversion and mixing
* @li @ref lpp "libpostproc" post processing library
* @li @ref lsws "libswscale" color conversion and scaling library
*
* @section ffmpeg_versioning Versioning and compatibility
*
* Each of the FFmpeg libraries contains a version.h header, which defines a
* major, minor and micro version number with the
* <em>LIBRARYNAME_VERSION_{MAJOR,MINOR,MICRO}</em> macros. The major version
* number is incremented with backward incompatible changes - e.g. removing
* parts of the public API, reordering public struct members, etc. The minor
* version number is incremented for backward compatible API changes or major
* new features - e.g. adding a new public function or a new decoder. The micro
* version number is incremented for smaller changes that a calling program
* might still want to check for - e.g. changing behavior in a previously
* unspecified situation.
*
* FFmpeg guarantees backward API and ABI compatibility for each library as long
* as its major version number is unchanged. This means that no public symbols
* will be removed or renamed. Types and names of the public struct members and
* values of public macros and enums will remain the same (unless they were
* explicitly declared as not part of the public API). Documented behavior will
* not change.
*
* In other words, any correct program that works with a given FFmpeg snapshot
* should work just as well without any changes with any later snapshot with the
* same major versions. This applies both to rebuilding the program against new
* FFmpeg versions and to replacing the dynamic FFmpeg libraries that a program
* links against.
*
* However, new public symbols may be added and new members may be appended to
* public structs whose size is not part of public ABI (most public structs in
* FFmpeg). New macros and enum values may be added. Behavior in undocumented
* situations may change slightly (and be documented). All those are accompanied
* by an entry in doc/APIchanges and incrementing either the minor or micro
* version number.
*/
/**
* @defgroup lavu Common utility functions
*
* @brief
* libavutil contains the code shared across all the other FFmpeg
* libraries
*
* @note In order to use the functions provided by avutil you must include
* the specific header.
*
* @{
*
* @defgroup lavu_crypto Crypto and Hashing
*
* @{
* @}
*
* @defgroup lavu_math Maths
* @{
*
* @}
*
* @defgroup lavu_string String Manipulation
*
* @{
*
* @}
*
* @defgroup lavu_mem Memory Management
*
* @{
*
* @}
*
* @defgroup lavu_data Data Structures
* @{
*
* @}
*
* @defgroup lavu_audio Audio related
*
* @{
*
* @}
*
* @defgroup lavu_error Error Codes
*
* @{
*
* @}
*
* @defgroup lavu_misc Other
*
* @{
*
* @defgroup lavu_internal Internal
*
* Non-exported functions, for internal use only
*
* @{
*
* @}
*/
/**
* @addtogroup lavu_ver
* @{
*/
/**
* Return the LIBAVUTIL_VERSION_INT constant.
*
* @return the version value as an unsigned integer
*/
unsigned avutil_version(void);
/**
* Return the libavutil build-time configuration.
*
* @return the configuration as a read-only string
*/
const char *avutil_configuration(void);
/**
* Return the libavutil license.
*
* @return the license as a read-only string
*/
const char *avutil_license(void);
/**
* @}
*/
/**
* @addtogroup lavu_media Media Type
* @brief Media Type
*
* Enumerators are numbered consecutively, starting at -1 for
* AVMEDIA_TYPE_UNKNOWN.
*/
enum AVMediaType {
    AVMEDIA_TYPE_UNKNOWN    = -1, ///< Usually treated as AVMEDIA_TYPE_DATA
    AVMEDIA_TYPE_VIDEO      =  0,
    AVMEDIA_TYPE_AUDIO      =  1,
    AVMEDIA_TYPE_DATA       =  2, ///< Opaque data information usually continuous
    AVMEDIA_TYPE_SUBTITLE   =  3,
    AVMEDIA_TYPE_ATTACHMENT =  4, ///< Opaque data information usually sparse
    AVMEDIA_TYPE_NB               ///< Follows the last media type (evaluates to 5)
};
/**
* Return a string describing the media_type enum.
*
* @param media_type the media type to describe
* @return a string describing media_type, or NULL if media_type is unknown
*/
const char *av_get_media_type_string(enum AVMediaType media_type);
/**
* @defgroup lavu_const Constants
* @{
*
* @defgroup lavu_enc Encoding specific
*
* @note these definitions should move to avcodec
* @{
*/
#define FF_LAMBDA_SHIFT   7                       ///< shift amount from which FF_LAMBDA_SCALE is derived
#define FF_LAMBDA_SCALE   (1 << FF_LAMBDA_SHIFT)  ///< evaluates to 128
#define FF_QP2LAMBDA      118                     ///< factor to convert from H.263 QP to lambda
#define FF_LAMBDA_MAX     (256 * 128 - 1)         ///< evaluates to 32767
#define FF_QUALITY_SCALE  FF_LAMBDA_SCALE         /* FIXME maybe remove */
/**
* @}
* @defgroup lavu_time Timestamp specific
*
* FFmpeg internal timebase and timestamp definitions
*
* @{
*/
/**
* @brief Undefined timestamp value
*
* Usually reported by demuxers that work on containers that do not provide
* either pts or dts.
*
* The value is the 64-bit pattern 0x8000000000000000 converted to int64_t.
*/
#define AV_NOPTS_VALUE ((int64_t)UINT64_C(0x8000000000000000))
/**
* Internal time base represented as integer
*
* NOTE(review): presumably the number of time base units per second
* (i.e. microsecond resolution); the unit is not stated here -- confirm.
*/
#define AV_TIME_BASE 1000000
/**
* Internal time base represented as fractional value
*
* Expands to a compound literal of type AVRational whose first member is
* initialized to 1 and whose second member is initialized to AV_TIME_BASE.
*/
#define AV_TIME_BASE_Q (AVRational){1, AV_TIME_BASE}
/**
* @}
* @}
* @defgroup lavu_picture Image related
*
* AVPicture types, pixel formats and basic image planes manipulation.
*
* @{
*/
/**
* Picture types; enumerators are numbered consecutively from 0.
*/
enum AVPictureType {
    AV_PICTURE_TYPE_NONE = 0, ///< Undefined
    AV_PICTURE_TYPE_I    = 1, ///< Intra
    AV_PICTURE_TYPE_P    = 2, ///< Predicted
    AV_PICTURE_TYPE_B    = 3, ///< Bi-dir predicted
    AV_PICTURE_TYPE_S    = 4, ///< S(GMC)-VOP MPEG4
    AV_PICTURE_TYPE_SI   = 5, ///< Switching Intra
    AV_PICTURE_TYPE_SP   = 6, ///< Switching Predicted
    AV_PICTURE_TYPE_BI   = 7  ///< BI type
};
/**
* Return a single letter to describe the given picture type
* pict_type.
*
* @param[in] pict_type the picture type
* @return a single character representing the picture type, '?' if
* pict_type is unknown
*/
char av_get_picture_type_char(enum AVPictureType pict_type);
/**
* @}
*/
#include "common.h"
#include "error.h"
#include "version.h"
#include "mathematics.h"
#include "rational.h"
#include "intfloat_readwrite.h"
#include "log.h"
#include "pixfmt.h"
/**
* Select a fallback pointer when the primary pointer is NULL.
*
* @param p pointer returned when it is non-NULL
* @param x pointer returned when p is NULL
* @return p if it is non-NULL, x otherwise; the const qualifier is cast away
*/
static inline void *av_x_if_null(const void *p, const void *x)
{
    const void *chosen = x;

    if (p)
        chosen = p;

    /* Going through intptr_t drops the const qualifier, as in the original. */
    return (void *)(intptr_t)chosen;
}
/**
* Compute the length of an integer list.
*
* @param elsize size in bytes of each list element (only 1, 2, 4 or 8)
* @param list pointer to the list
* @param term list terminator (usually 0 or -1)
* @return length of the list, in elements, not counting the terminator
*/
unsigned av_int_list_length_for_size(unsigned elsize,
const void *list, uint64_t term) av_pure;
/**
* Compute the length of an integer list.
*
* The element size is taken from sizeof(*(list)) and forwarded, together
* with list and term, to av_int_list_length_for_size().
*
* @param list pointer to the list
* @param term list terminator (usually 0 or -1)
* @return length of the list, in elements, not counting the terminator
*/
#define av_int_list_length(list, term) \
av_int_list_length_for_size(sizeof(*(list)), list, term)
/**
* @}
* @}
*/
#endif /* AVUTIL_AVUTIL_H */

View File

@ -1,67 +0,0 @@
/*
* Copyright (c) 2006 Ryan Martell. (rdm4@martellventures.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_BASE64_H
#define AVUTIL_BASE64_H
#include <stdint.h>
/**
* @defgroup lavu_base64 Base64
* @ingroup lavu_crypto
* @{
*/
/**
* Decode a base64-encoded string.
*
* @param out buffer for decoded data
* @param in null-terminated input string
* @param out_size size in bytes of the out buffer, must be at
* least 3/4 of the length of in
* @return number of bytes written to out, or a negative value in case of
* invalid input
*/
int av_base64_decode(uint8_t *out, const char *in, int out_size);
/**
* Encode data to base64 and null-terminate the result.
*
* @param out buffer for encoded data
* @param out_size size in bytes of the out buffer (including the
* null terminator), must be at least AV_BASE64_SIZE(in_size)
* @param in input buffer containing the data to encode
* @param in_size size in bytes of the in buffer
* @return out, or NULL in case of error
*/
char *av_base64_encode(char *out, int out_size, const uint8_t *in, int in_size);
/**
* Output buffer size, in bytes, needed to base64-encode x bytes into a
* null-terminated string: four characters for every started group of
* three input bytes, plus one byte for the terminator.
*/
#define AV_BASE64_SIZE(x) (1 + 4 * (((x) + 2) / 3))
/**
* @}
*/
#endif /* AVUTIL_BASE64_H */

View File

@ -1,77 +0,0 @@
/*
* Blowfish algorithm
* Copyright (c) 2012 Samuel Pitoiset
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_BLOWFISH_H
#define AVUTIL_BLOWFISH_H
#include <stdint.h>
/**
* @defgroup lavu_blowfish Blowfish
* @ingroup lavu_crypto
* @{
*/
/** Round count; also used to size the p array of AVBlowfish. */
#define AV_BF_ROUNDS 16
/**
* Blowfish context, set up by av_blowfish_init() and passed to the
* av_blowfish_crypt*() functions.
*/
typedef struct AVBlowfish {
uint32_t p[AV_BF_ROUNDS + 2]; ///< 18 32-bit words; presumably the Blowfish P-array -- TODO confirm
uint32_t s[4][256]; ///< 4 tables of 256 32-bit words; presumably the Blowfish S-boxes -- TODO confirm
} AVBlowfish;
/**
* Initialize an AVBlowfish context.
*
* @param ctx an AVBlowfish context
* @param key a key
* @param key_len length of the key (presumably in bytes -- TODO confirm)
*/
void av_blowfish_init(struct AVBlowfish *ctx, const uint8_t *key, int key_len);
/**
* Encrypt or decrypt a buffer using a previously initialized context.
*
* The input is supplied as two four-byte halves.
* NOTE(review): xl and xr are non-const pointers, so the result is
* presumably written back through them -- confirm against the implementation.
*
* @param ctx an AVBlowfish context
* @param xl left four-byte half of the input to be encrypted or decrypted
* @param xr right four-byte half of the input to be encrypted or decrypted
* @param decrypt 0 for encryption, 1 for decryption
*/
void av_blowfish_crypt_ecb(struct AVBlowfish *ctx, uint32_t *xl, uint32_t *xr,
int decrypt);
/**
* Encrypt or decrypt a buffer using a previously initialized context.
*
* @param ctx an AVBlowfish context
* @param dst destination array, can be equal to src
* @param src source array, can be equal to dst
* @param count number of 8-byte blocks
* @param iv initialization vector for CBC mode; if NULL, ECB will be used
* @param decrypt 0 for encryption, 1 for decryption
*/
void av_blowfish_crypt(struct AVBlowfish *ctx, uint8_t *dst, const uint8_t *src,
int count, uint8_t *iv, int decrypt);
/**
* @}
*/
#endif /* AVUTIL_BLOWFISH_H */

Some files were not shown because too many files have changed in this diff Show More