mirror of
https://github.com/opencv/opencv.git
synced 2025-06-19 00:46:39 +08:00
commit
5ebdf6cedd
30
.github/ISSUE_TEMPLATE.md
vendored
Normal file
30
.github/ISSUE_TEMPLATE.md
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
<!--
|
||||
If you have a question rather than reporting a bug please go to http://answers.opencv.org where you get much faster responses.
|
||||
If you need further assistance please read [How To Contribute](https://github.com/opencv/opencv/wiki/How_to_contribute).
|
||||
|
||||
This is a template helping you to create an issue which can be processed as quickly as possible. This is the bug reporting section for the OpenCV library.
|
||||
-->
|
||||
|
||||
##### System information (version)
|
||||
<!-- Example
|
||||
- OpenCV => 3.1
|
||||
- Operating System / Platform => Windows 64 Bit
|
||||
- Compiler => Visual Studio 2015
|
||||
-->
|
||||
|
||||
- OpenCV => :grey_question:
|
||||
- Operating System / Platform => :grey_question:
|
||||
- Compiler => :grey_question:
|
||||
|
||||
##### Detailed description
|
||||
|
||||
<!-- your description -->
|
||||
|
||||
##### Steps to reproduce
|
||||
|
||||
<!-- to add code example fence it with triple backticks and optional file extension
|
||||
```.cpp
|
||||
// C++ code example
|
||||
```
|
||||
or attach as .txt or .zip file
|
||||
-->
|
9
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
9
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
<!-- Please use this line to close one or multiple issues when this pullrequest gets merged
|
||||
You can add another line right under the first one:
|
||||
resolves #1234
|
||||
resolves #1235
|
||||
-->
|
||||
|
||||
### This pullrequest changes
|
||||
|
||||
<!-- Please describe what your pullrequest is changing -->
|
15
.gitignore
vendored
15
.gitignore
vendored
@ -8,3 +8,18 @@
|
||||
Thumbs.db
|
||||
tags
|
||||
tegra/
|
||||
bin/
|
||||
*.sdf
|
||||
*.opensdf
|
||||
*.obj
|
||||
*.stamp
|
||||
*.depend
|
||||
*.rule
|
||||
*.tmp
|
||||
*/debug
|
||||
*/CMakeFiles
|
||||
CMakeCache.txt
|
||||
*.suo
|
||||
*.log
|
||||
*.tlog
|
||||
build
|
||||
|
8
3rdparty/carotene/.gitignore
vendored
Normal file
8
3rdparty/carotene/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Gedit temp files
|
||||
*~
|
||||
|
||||
# Qt Creator file
|
||||
*.user
|
||||
|
||||
# MacOS-specific (Desktop Services Store)
|
||||
.DS_Store
|
42
3rdparty/carotene/CMakeLists.txt
vendored
Normal file
42
3rdparty/carotene/CMakeLists.txt
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
|
||||
|
||||
project(Carotene)
|
||||
|
||||
set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
|
||||
|
||||
set(CAROTENE_INCLUDE_DIR include)
|
||||
set(CAROTENE_SOURCE_DIR src)
|
||||
|
||||
file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
|
||||
file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
|
||||
"${CAROTENE_SOURCE_DIR}/*.hpp")
|
||||
|
||||
include_directories(${CAROTENE_INCLUDE_DIR})
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
|
||||
|
||||
# allow more inlines - these parameters improve performance for:
|
||||
# - matchTemplate about 5-10%
|
||||
# - goodFeaturesToTrack 10-20%
|
||||
# - cornerHarris 30% for some cases
|
||||
|
||||
set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
|
||||
endif()
|
||||
|
||||
add_library(carotene_objs OBJECT
|
||||
${carotene_headers}
|
||||
${carotene_sources}
|
||||
)
|
||||
|
||||
if(NOT CAROTENE_NS STREQUAL "carotene")
|
||||
target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
|
||||
endif()
|
||||
|
||||
if(WITH_NEON)
|
||||
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
|
||||
endif()
|
||||
|
||||
set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
|
||||
|
||||
add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
|
2
3rdparty/carotene/README.md
vendored
Normal file
2
3rdparty/carotene/README.md
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
This is Carotene, a low-level library containing optimized CPU routines
|
||||
that are useful for computer vision algorithms.
|
114
3rdparty/carotene/hal/CMakeLists.txt
vendored
Normal file
114
3rdparty/carotene/hal/CMakeLists.txt
vendored
Normal file
@ -0,0 +1,114 @@
|
||||
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
|
||||
|
||||
include(CheckCCompilerFlag)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
|
||||
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
||||
set(ARM TRUE)
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
|
||||
set(AARCH64 TRUE)
|
||||
endif()
|
||||
|
||||
set(TEGRA_COMPILER_FLAGS "")
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
# Generate unwind information even for functions that can't throw/propagate exceptions.
|
||||
# This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
|
||||
list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
|
||||
endif()
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
|
||||
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
|
||||
elseif(CMAKE_COMPILER_IS_CLANGCXX)
|
||||
list(APPEND TEGRA_COMPILER_FLAGS -fwrapv)
|
||||
else()
|
||||
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
|
||||
-fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
|
||||
endif()
|
||||
if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
|
||||
list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
|
||||
-floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
|
||||
|
||||
if(ARMEABI_V7A)
|
||||
if (CMAKE_COMPILER_IS_GNUCXX)
|
||||
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
|
||||
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_LOGS)
|
||||
add_definitions(-DHAVE_LOGS)
|
||||
endif()
|
||||
|
||||
set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
|
||||
|
||||
function(compile_carotene)
|
||||
if(ENABLE_NEON)
|
||||
set(WITH_NEON ON)
|
||||
endif()
|
||||
|
||||
add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
|
||||
|
||||
if(ARM OR AARCH64)
|
||||
if(CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
|
||||
endif()
|
||||
check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
|
||||
check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
|
||||
if(${CXX_HAS_MFPU_NEON} AND ${C_HAS_MFPU_NEON})
|
||||
get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
|
||||
if(old_flags)
|
||||
set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
|
||||
else()
|
||||
set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
compile_carotene()
|
||||
|
||||
include_directories("${CAROTENE_DIR}/include")
|
||||
|
||||
get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
|
||||
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
|
||||
|
||||
if (CMAKE_COMPILER_IS_GNUCXX)
|
||||
# allow more inlines - these parameters improve performance for:
|
||||
# matchTemplate about 5-10%
|
||||
# goodFeaturesToTrack 10-20%
|
||||
# cornerHarris 30% for some cases
|
||||
set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
|
||||
# set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
|
||||
endif()
|
||||
|
||||
add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
|
||||
set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
|
||||
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
|
||||
set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
|
||||
if(NOT BUILD_SHARED_LIBS)
|
||||
ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
|
||||
endif()
|
||||
target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)
|
||||
|
||||
set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
|
||||
set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
|
||||
set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
|
||||
set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)
|
||||
|
||||
configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
|
||||
configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
|
||||
configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
|
||||
configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)
|
1851
3rdparty/carotene/hal/tegra_hal.hpp
vendored
Normal file
1851
3rdparty/carotene/hal/tegra_hal.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
47
3rdparty/carotene/include/carotene/definitions.hpp
vendored
Normal file
47
3rdparty/carotene/include/carotene/definitions.hpp
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_DEFINITIONS_HPP
|
||||
#define CAROTENE_DEFINITIONS_HPP
|
||||
|
||||
#ifndef CAROTENE_NS
|
||||
#define CAROTENE_NS carotene
|
||||
#endif
|
||||
|
||||
#endif
|
2492
3rdparty/carotene/include/carotene/functions.hpp
vendored
Normal file
2492
3rdparty/carotene/include/carotene/functions.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
125
3rdparty/carotene/include/carotene/types.hpp
vendored
Normal file
125
3rdparty/carotene/include/carotene/types.hpp
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_TYPES_HPP
|
||||
#define CAROTENE_TYPES_HPP
|
||||
|
||||
#include <carotene/definitions.hpp>
|
||||
#include <stdint.h>
|
||||
#include <cstddef>
|
||||
|
||||
#ifndef UINT32_MAX
|
||||
#define UINT32_MAX (4294967295U)
|
||||
#endif
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
using std::size_t;
|
||||
using std::ptrdiff_t;
|
||||
|
||||
typedef int8_t s8;
|
||||
typedef uint8_t u8;
|
||||
typedef int16_t s16;
|
||||
typedef uint16_t u16;
|
||||
typedef int32_t s32;
|
||||
typedef uint32_t u32;
|
||||
typedef float f32;
|
||||
typedef int64_t s64;
|
||||
typedef uint64_t u64;
|
||||
typedef double f64;
|
||||
|
||||
typedef ptrdiff_t stride_t;
|
||||
|
||||
enum CONVERT_POLICY
|
||||
{
|
||||
CONVERT_POLICY_WRAP,
|
||||
CONVERT_POLICY_SATURATE
|
||||
};
|
||||
|
||||
enum BORDER_MODE
|
||||
{
|
||||
BORDER_MODE_UNDEFINED,
|
||||
BORDER_MODE_CONSTANT,
|
||||
BORDER_MODE_REPLICATE,
|
||||
BORDER_MODE_REFLECT,
|
||||
BORDER_MODE_REFLECT101,
|
||||
BORDER_MODE_WRAP
|
||||
};
|
||||
|
||||
enum FLIP_MODE
|
||||
{
|
||||
FLIP_HORIZONTAL_MODE = 1,
|
||||
FLIP_VERTICAL_MODE = 2,
|
||||
FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
|
||||
};
|
||||
|
||||
enum COLOR_SPACE
|
||||
{
|
||||
COLOR_SPACE_BT601,
|
||||
COLOR_SPACE_BT709
|
||||
};
|
||||
|
||||
struct Size2D {
|
||||
Size2D() : width(0), height(0) {}
|
||||
Size2D(size_t width_, size_t height_) : width(width_), height(height_) {}
|
||||
|
||||
size_t width;
|
||||
size_t height;
|
||||
|
||||
inline size_t total() const
|
||||
{
|
||||
return width * height;
|
||||
}
|
||||
};
|
||||
|
||||
struct Margin {
|
||||
Margin() : left(0), right(0), top(0), bottom(0) {}
|
||||
Margin(size_t left_, size_t right_, size_t top_, size_t bottom_)
|
||||
: left(left_), right(right_), top(top_), bottom(bottom_) {}
|
||||
|
||||
// these are measured in elements
|
||||
size_t left, right, top, bottom;
|
||||
};
|
||||
|
||||
struct KeypointStore {
|
||||
virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;
|
||||
virtual ~KeypointStore() {};
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
241
3rdparty/carotene/src/absdiff.cpp
vendored
Normal file
241
3rdparty/carotene/src/absdiff.cpp
vendored
Normal file
@ -0,0 +1,241 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct AbsDiff
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vabdq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vabd(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0];
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct AbsDiffSigned
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
|
||||
typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
|
||||
v_dst = internal::vqsubq(v_max, v_min);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
|
||||
typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
|
||||
v_dst = internal::vqsub(v_max, v_min);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const u8 *src0Base, ptrdiff_t src0Stride,
|
||||
const u8 *src1Base, ptrdiff_t src1Stride,
|
||||
u8 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiff<u8>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const u16 *src0Base, ptrdiff_t src0Stride,
|
||||
const u16 *src1Base, ptrdiff_t src1Stride,
|
||||
u16 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiff<u16>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const s8 *src0Base, ptrdiff_t src0Stride,
|
||||
const s8 *src1Base, ptrdiff_t src1Stride,
|
||||
s8 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiffSigned<s8>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const s16 *src0Base, ptrdiff_t src0Stride,
|
||||
const s16 *src1Base, ptrdiff_t src1Stride,
|
||||
s16 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiffSigned<s16>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const s32 *src0Base, ptrdiff_t src0Stride,
|
||||
const s32 *src1Base, ptrdiff_t src1Stride,
|
||||
s32 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiffSigned<s32>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void absDiff(const Size2D &size,
|
||||
const f32 * src0Base, ptrdiff_t src0Stride,
|
||||
const f32 * src1Base, ptrdiff_t src1Stride,
|
||||
f32 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride, AbsDiff<f32>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
408
3rdparty/carotene/src/accumulate.cpp
vendored
Normal file
408
3rdparty/carotene/src/accumulate.cpp
vendored
Normal file
@ -0,0 +1,408 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
void accumulate(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
s16 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
internal::prefetch(dst + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
int16x8_t v_dst0 = vld1q_s16(dst + j);
|
||||
int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
|
||||
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
|
||||
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
|
||||
v_dst0 = vqaddq_s16(v_dst0, v_src0);
|
||||
v_dst1 = vqaddq_s16(v_dst1, v_src1);
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
uint8x8_t v_src = vld1_u8(src + j);
|
||||
int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
|
||||
int16x8_t v_dst = vld1q_s16(dst + j);
|
||||
v_dst = vqaddq_s16(v_dst, v_src16);
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <int shift>
|
||||
void accumulateSquareConst(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
s16 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
internal::prefetch(dst + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
|
||||
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
|
||||
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
|
||||
|
||||
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
|
||||
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
|
||||
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
|
||||
|
||||
v_srclo = vget_low_s16(v_src1);
|
||||
v_srchi = vget_high_s16(v_src1);
|
||||
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
|
||||
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
|
||||
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
|
||||
int16x8_t v_dst = vld1q_s16(dst + j);
|
||||
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
|
||||
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
|
||||
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
s32 srcVal = src[j];
|
||||
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void accumulateSquareConst<0>(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
s16 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
internal::prefetch(dst + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
|
||||
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
|
||||
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
|
||||
|
||||
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
|
||||
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
|
||||
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
|
||||
|
||||
v_srclo = vget_low_s16(v_src1);
|
||||
v_srchi = vget_high_s16(v_src1);
|
||||
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
|
||||
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
|
||||
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
|
||||
int16x8_t v_dst = vld1q_s16(dst + j);
|
||||
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
|
||||
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
|
||||
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
s32 srcVal = src[j];
|
||||
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
s16 *dstBase, ptrdiff_t dstStride);
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void accumulateSquare(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
s16 *dstBase, ptrdiff_t dstStride,
|
||||
u32 shift)
|
||||
{
|
||||
if (shift >= 16)
|
||||
{
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
std::memset(dst, 0, sizeof(s16) * size.width);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
internal::assertSupportedConfiguration();
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
// this ugly contruction is needed to avoid:
|
||||
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
|
||||
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
|
||||
|
||||
accumulateSquareConstFunc funcs[16] =
|
||||
{
|
||||
accumulateSquareConst<0>,
|
||||
accumulateSquareConst<1>,
|
||||
accumulateSquareConst<2>,
|
||||
accumulateSquareConst<3>,
|
||||
accumulateSquareConst<4>,
|
||||
accumulateSquareConst<5>,
|
||||
accumulateSquareConst<6>,
|
||||
accumulateSquareConst<7>,
|
||||
accumulateSquareConst<8>,
|
||||
accumulateSquareConst<9>,
|
||||
accumulateSquareConst<10>,
|
||||
accumulateSquareConst<11>,
|
||||
accumulateSquareConst<12>,
|
||||
accumulateSquareConst<13>,
|
||||
accumulateSquareConst<14>,
|
||||
accumulateSquareConst<15>
|
||||
}, func = funcs[shift];
|
||||
|
||||
func(size, srcBase, srcStride, dstBase, dstStride);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)shift;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
struct AccumulateWeightedHalf
|
||||
{
|
||||
typedef u8 type;
|
||||
|
||||
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
|
||||
uint8x16_t & v_dst) const
|
||||
{
|
||||
v_dst = vhaddq_u8(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
|
||||
uint8x8_t & v_dst) const
|
||||
{
|
||||
v_dst = vhadd_u8(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
|
||||
}
|
||||
};
|
||||
|
||||
struct AccumulateWeighted
|
||||
{
|
||||
typedef u8 type;
|
||||
|
||||
float alpha, beta;
|
||||
float32x4_t v_alpha, v_beta;
|
||||
|
||||
explicit AccumulateWeighted(float _alpha) :
|
||||
alpha(_alpha), beta(1 - _alpha)
|
||||
{
|
||||
v_alpha = vdupq_n_f32(alpha);
|
||||
v_beta = vdupq_n_f32(beta);
|
||||
}
|
||||
|
||||
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
|
||||
uint8x16_t & v_dst) const
|
||||
{
|
||||
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
|
||||
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
|
||||
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
|
||||
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
|
||||
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
|
||||
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
|
||||
|
||||
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
|
||||
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
|
||||
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
|
||||
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
|
||||
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
|
||||
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
|
||||
|
||||
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
|
||||
}
|
||||
|
||||
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
|
||||
uint8x8_t & v_dst) const
|
||||
{
|
||||
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
|
||||
|
||||
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
|
||||
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
|
||||
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
|
||||
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
|
||||
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
|
||||
|
||||
v_dst = vmovn_u16(_v_dst);
|
||||
}
|
||||
|
||||
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = beta * src1[0] + alpha * src0[0];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void accumulateWeighted(const Size2D &size,
|
||||
const u8 *srcBase, ptrdiff_t srcStride,
|
||||
u8 *dstBase, ptrdiff_t dstStride,
|
||||
f32 alpha)
|
||||
{
|
||||
if (alpha == 0.0f)
|
||||
return;
|
||||
if (alpha == 1.0f)
|
||||
{
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
std::memcpy(dst, src, sizeof(u8) * size.width);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
internal::assertSupportedConfiguration();
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
// in this case we can use the following scheme:
|
||||
// dst[p] = (src[p] + dst[p]) >> 1
|
||||
// which is faster
|
||||
if (alpha == 0.5f)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
srcBase, srcStride,
|
||||
dstBase, dstStride,
|
||||
dstBase, dstStride,
|
||||
AccumulateWeightedHalf());
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
internal::vtransform(size,
|
||||
srcBase, srcStride,
|
||||
dstBase, dstStride,
|
||||
dstBase, dstStride,
|
||||
AccumulateWeighted(alpha));
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)alpha;
|
||||
#endif
|
||||
}
|
||||
|
||||
} //namespace CAROTENE_NS
|
/* ==== new file: 3rdparty/carotene/src/add.cpp (475 lines, vendored) ==== */
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct AddWrap
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vaddq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vadd(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct AddSaturate
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vqaddq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vqadd(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void add(const Size2D &size,
|
||||
const u8 * src0Base, ptrdiff_t src0Stride,
|
||||
const u8 * src1Base, ptrdiff_t src1Stride,
|
||||
u8 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<u8, u16>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<u8, u16>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const s8 * src0Base, ptrdiff_t src0Stride,
|
||||
const s8 * src1Base, ptrdiff_t src1Stride,
|
||||
s8 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<s8, s16>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<s8, s16>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const u8 * src0Base, ptrdiff_t src0Stride,
|
||||
const u8 * src1Base, ptrdiff_t src1Stride,
|
||||
s16 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
||||
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
|
||||
u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw32; j += 32)
|
||||
{
|
||||
internal::prefetch(src0 + j);
|
||||
internal::prefetch(src1 + j);
|
||||
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
|
||||
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
|
||||
vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
|
||||
vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
|
||||
vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
|
||||
vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
uint8x8_t v_src0 = vld1_u8(src0 + j);
|
||||
uint8x8_t v_src1 = vld1_u8(src1 + j);
|
||||
vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
dst[j] = (u16)src0[j] + (u16)src1[j];
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const u8 * src0Base, ptrdiff_t src0Stride,
|
||||
const s16 * src1Base, ptrdiff_t src1Stride,
|
||||
s16 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
||||
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
|
||||
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src0 + j);
|
||||
internal::prefetch(src1 + j);
|
||||
uint8x16_t v_src0 = vld1q_u8(src0 + j);
|
||||
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
|
||||
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
|
||||
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
|
||||
int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
|
||||
int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
|
||||
int16x8_t v_src1 = vld1q_s16(src1 + j);
|
||||
int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src0 + j);
|
||||
internal::prefetch(src1 + j);
|
||||
uint8x16_t v_src0 = vld1q_u8(src0 + j);
|
||||
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
|
||||
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
|
||||
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
|
||||
int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
|
||||
int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
|
||||
int16x8_t v_src1 = vld1q_s16(src1 + j);
|
||||
int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const s16 * src0Base, ptrdiff_t src0Stride,
|
||||
const s16 * src1Base, ptrdiff_t src1Stride,
|
||||
s16 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<s16, s32>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<s16, s32>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const u16 * src0Base, ptrdiff_t src0Stride,
|
||||
const u16 * src1Base, ptrdiff_t src1Stride,
|
||||
u16 * dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<u16, u32>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<u16, u32>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const s32 * src0Base, ptrdiff_t src0Stride,
|
||||
const s32 * src1Base, ptrdiff_t src1Stride,
|
||||
s32 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<s32, s64>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<s32, s64>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const u32 * src0Base, ptrdiff_t src0Stride,
|
||||
const u32 * src1Base, ptrdiff_t src1Stride,
|
||||
u32 * dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddSaturate<u32, u64>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<u32, u64>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add(const Size2D &size,
|
||||
const f32 * src0Base, ptrdiff_t src0Stride,
|
||||
const f32 * src1Base, ptrdiff_t src1Stride,
|
||||
f32 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
AddWrap<f32, f32>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
/* ==== new file: 3rdparty/carotene/src/add_weighted.cpp (265 lines, vendored) ==== */
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
using namespace internal;
|
||||
|
||||
template <typename T> struct TypeTraits;
|
||||
template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; };
|
||||
template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; };
|
||||
template <> struct TypeTraits<u16> { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; };
|
||||
template <> struct TypeTraits<s16> { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; };
|
||||
template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; };
|
||||
template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; };
|
||||
template <> struct TypeTraits<f32> { typedef f64 wide; typedef float32x4_t vec128; };
|
||||
|
||||
template <typename T> struct wAdd
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
f32 alpha, beta, gamma;
|
||||
typedef typename TypeTraits<T>::wide wtype;
|
||||
wAdd<wtype> wideAdd;
|
||||
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
|
||||
alpha(_alpha), beta(_beta), gamma(_gamma),
|
||||
wideAdd(_alpha, _beta, _gamma) {}
|
||||
|
||||
void operator() (const typename VecTraits<T>::vec128 & v_src0,
|
||||
const typename VecTraits<T>::vec128 & v_src1,
|
||||
typename VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
typename VecTraits<wtype>::vec128 vrl, vrh;
|
||||
wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
|
||||
wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
|
||||
|
||||
v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<T>::vec64 & v_src0,
|
||||
const typename VecTraits<T>::vec64 & v_src1,
|
||||
typename VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
typename VecTraits<wtype>::vec128 vr;
|
||||
wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
|
||||
|
||||
v_dst = vqmovn(vr);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct wAdd<s32>
|
||||
{
|
||||
typedef s32 type;
|
||||
|
||||
f32 alpha, beta, gamma;
|
||||
float32x4_t valpha, vbeta, vgamma;
|
||||
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
|
||||
alpha(_alpha), beta(_beta), gamma(_gamma)
|
||||
{
|
||||
valpha = vdupq_n_f32(_alpha);
|
||||
vbeta = vdupq_n_f32(_beta);
|
||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<s32>::vec128 & v_src0,
|
||||
const typename VecTraits<s32>::vec128 & v_src1,
|
||||
typename VecTraits<s32>::vec128 & v_dst) const
|
||||
{
|
||||
float32x4_t vs1 = vcvtq_f32_s32(v_src0);
|
||||
float32x4_t vs2 = vcvtq_f32_s32(v_src1);
|
||||
|
||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||
v_dst = vcvtq_s32_f32(vs1);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<s32>::vec64 & v_src0,
|
||||
const typename VecTraits<s32>::vec64 & v_src1,
|
||||
typename VecTraits<s32>::vec64 & v_dst) const
|
||||
{
|
||||
float32x2_t vs1 = vcvt_f32_s32(v_src0);
|
||||
float32x2_t vs2 = vcvt_f32_s32(v_src1);
|
||||
|
||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||
v_dst = vcvt_s32_f32(vs1);
|
||||
}
|
||||
|
||||
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
|
||||
{
|
||||
dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct wAdd<u32>
|
||||
{
|
||||
typedef u32 type;
|
||||
|
||||
f32 alpha, beta, gamma;
|
||||
float32x4_t valpha, vbeta, vgamma;
|
||||
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
|
||||
alpha(_alpha), beta(_beta), gamma(_gamma)
|
||||
{
|
||||
valpha = vdupq_n_f32(_alpha);
|
||||
vbeta = vdupq_n_f32(_beta);
|
||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<u32>::vec128 & v_src0,
|
||||
const typename VecTraits<u32>::vec128 & v_src1,
|
||||
typename VecTraits<u32>::vec128 & v_dst) const
|
||||
{
|
||||
float32x4_t vs1 = vcvtq_f32_u32(v_src0);
|
||||
float32x4_t vs2 = vcvtq_f32_u32(v_src1);
|
||||
|
||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||
v_dst = vcvtq_u32_f32(vs1);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<u32>::vec64 & v_src0,
|
||||
const typename VecTraits<u32>::vec64 & v_src1,
|
||||
typename VecTraits<u32>::vec64 & v_dst) const
|
||||
{
|
||||
float32x2_t vs1 = vcvt_f32_u32(v_src0);
|
||||
float32x2_t vs2 = vcvt_f32_u32(v_src1);
|
||||
|
||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||
v_dst = vcvt_u32_f32(vs1);
|
||||
}
|
||||
|
||||
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
|
||||
{
|
||||
dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct wAdd<f32>
|
||||
{
|
||||
typedef f32 type;
|
||||
|
||||
f32 alpha, beta, gamma;
|
||||
float32x4_t valpha, vbeta, vgamma;
|
||||
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
|
||||
alpha(_alpha), beta(_beta), gamma(_gamma)
|
||||
{
|
||||
valpha = vdupq_n_f32(_alpha);
|
||||
vbeta = vdupq_n_f32(_beta);
|
||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<f32>::vec128 & v_src0,
|
||||
const typename VecTraits<f32>::vec128 & v_src1,
|
||||
typename VecTraits<f32>::vec128 & v_dst) const
|
||||
{
|
||||
float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
|
||||
v_dst = vmlaq_f32(vs1, v_src1, vbeta);
|
||||
}
|
||||
|
||||
void operator() (const typename VecTraits<f32>::vec64 & v_src0,
|
||||
const typename VecTraits<f32>::vec64 & v_src1,
|
||||
typename VecTraits<f32>::vec64 & v_dst) const
|
||||
{
|
||||
float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
|
||||
v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
|
||||
|
||||
}
|
||||
|
||||
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
|
||||
{
|
||||
dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Generates the public addWeighted() entry point for one element type:
//   dst = saturate(src0*alpha + src1*beta + gamma)
// The per-element arithmetic is delegated to the wAdd<type> functor, driven
// over the whole ROI by internal::vtransform (SIMD main loop + scalar tail).
#define IMPL_ADDWEIGHTED(type)                                  \
void addWeighted(const Size2D &size,                            \
                 const type * src0Base, ptrdiff_t src0Stride,   \
                 const type * src1Base, ptrdiff_t src1Stride,   \
                 type * dstBase, ptrdiff_t dstStride,           \
                 f32 alpha, f32 beta, f32 gamma)                \
{                                                               \
    internal::assertSupportedConfiguration();                   \
    wAdd<type> wgtAdd(alpha,                                    \
                      beta,                                     \
                      gamma);                                   \
    internal::vtransform(size,                                  \
                         src0Base, src0Stride,                  \
                         src1Base, src1Stride,                  \
                         dstBase, dstStride,                    \
                         wgtAdd);                               \
}
|
||||
|
||||
#else

// Non-NEON fallback: the generated function only runs the configuration
// assertion (expected to reject unsupported builds) and ignores its arguments.
#define IMPL_ADDWEIGHTED(type)                  \
void addWeighted(const Size2D &,                \
                 const type *, ptrdiff_t,       \
                 const type *, ptrdiff_t,       \
                 type *, ptrdiff_t,             \
                 f32, f32, f32)                 \
{                                               \
    internal::assertSupportedConfiguration();   \
}

#endif

// Instantiate the addWeighted overload for every supported element type.
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)

} // namespace CAROTENE_NS
|
225
3rdparty/carotene/src/bitwise.cpp
vendored
Normal file
225
3rdparty/carotene/src/bitwise.cpp
vendored
Normal file
@ -0,0 +1,225 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
// Element-wise AND functor for u8 data, consumed by internal::vtransform.
// Provides the three paths vtransform expects: 128-bit SIMD, 64-bit SIMD,
// and a scalar fallback for tail elements.
struct BitwiseAnd
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        v_dst = vandq_u8(v_src0, v_src1);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
                     uint8x8_t & v_dst) const
    {
        v_dst = vand_u8(v_src0, v_src1);
    }

    // Scalar tail.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        dst[0] = src0[0] & src1[0];
    }
};
|
||||
|
||||
// Element-wise OR functor for u8 data; same three-path shape as BitwiseAnd.
struct BitwiseOr
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        v_dst = vorrq_u8(v_src0, v_src1);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
                     uint8x8_t & v_dst) const
    {
        v_dst = vorr_u8(v_src0, v_src1);
    }

    // Scalar tail.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        dst[0] = src0[0] | src1[0];
    }
};
|
||||
|
||||
// Element-wise XOR functor for u8 data; same three-path shape as BitwiseAnd.
struct BitwiseXor
{
    typedef u8 type;

    // 16 bytes at a time.
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
                     uint8x16_t & v_dst) const
    {
        v_dst = veorq_u8(v_src0, v_src1);
    }

    // 8 bytes at a time.
    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
                     uint8x8_t & v_dst) const
    {
        v_dst = veor_u8(v_src0, v_src1);
    }

    // Scalar tail.
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
    {
        dst[0] = src0[0] ^ src1[0];
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
// Bitwise NOT of a u8 ROI: dst = ~src, row by row.
// Processes 32 bytes per iteration (two q-registers), then 8, then a
// scalar tail. roiw32/roiw8 are the last start indices for which a full
// 32-/8-byte load stays inside the row.
void bitwiseNot(const Size2D &size,
                const u8 *srcBase, ptrdiff_t srcStride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // main loop: 32 bytes / iteration
        for (; j < roiw32; j += 32)
        {
            internal::prefetch(src + j);
            uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16);
            uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1);
            vst1q_u8(dst + j, v_dst0);
            vst1q_u8(dst + j + 16, v_dst1);
        }
        // 8-byte cleanup loop
        for (; j < roiw8; j += 8)
        {
            uint8x8_t v_src = vld1_u8(src + j);
            uint8x8_t v_dst = vmvn_u8(v_src);
            vst1_u8(dst + j, v_dst);
        }

        // scalar tail
        for (; j < size.width; j++)
        {
            dst[j] = ~src[j];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
// Bitwise AND of two u8 ROIs: dst = src0 & src1.
// Delegates the SIMD/scalar looping to internal::vtransform with the
// BitwiseAnd functor defined above.
void bitwiseAnd(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, BitwiseAnd());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
// Bitwise OR of two u8 ROIs: dst = src0 | src1 (via internal::vtransform).
void bitwiseOr(const Size2D &size,
               const u8 *src0Base, ptrdiff_t src0Stride,
               const u8 *src1Base, ptrdiff_t src1Stride,
               u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, BitwiseOr());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
// Bitwise XOR of two u8 ROIs: dst = src0 ^ src1 (via internal::vtransform).
void bitwiseXor(const Size2D &size,
                const u8 *src0Base, ptrdiff_t src0Stride,
                const u8 *src1Base, ptrdiff_t src1Stride,
                u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride, BitwiseXor());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
1337
3rdparty/carotene/src/blur.cpp
vendored
Normal file
1337
3rdparty/carotene/src/blur.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
773
3rdparty/carotene/src/canny.cpp
vendored
Normal file
773
3rdparty/carotene/src/canny.cpp
vendored
Normal file
@ -0,0 +1,773 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "saturate_cast.hpp"
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
namespace {
|
||||
// Horizontal pass of the 3x1 Sobel row filter used by Canny.
// For each output position it produces:
//   dstx[i] = src[i+1] - src[i-1]              (horizontal difference)
//   dsty[i] = src[i-1] + 2*src[i] + src[i+1]   (vertical smoothing weights)
// borderxl/borderxr select replicate-style handling at the row edges via
// byte-shuffle masks (vfmask/vtmask) — the exact mask constants encode
// vtbl1 index tables; NOTE(review): verify against the border convention
// used by the caller before modifying them.
struct RowFilter3x3Canny
{
    inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
    {
        // vtbl index masks: 0xFF lanes produce zero; the low byte selects
        // either the true left neighbour or a duplicated edge pixel.
        vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
        vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
        // how many real pixels exist beyond the ROI on each side (0 or 1)
        lookLeft = offsetk - borderxl;
        lookRight = offsetk - borderxr;
    }

    inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
    {
        // previous 8-byte chunk, with the left border resolved via vfmask
        uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
        ptrdiff_t i = 0;
        for (; i < width - 8 + lookRight; i += 8)
        {
            internal::prefetch(src + i);
            uint8x8_t l18u = vld1_u8(src + i + 1);

            uint8x8_t l2 = l18u;                       // right neighbours
            uint8x8_t l0 = vext_u8(l, l18u, 6);        // left neighbours
            // centre pixels doubled (widened to s16)
            int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));

            l = l18u;

            int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
            int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
            int16x8_t ldy = vaddq_s16(l02, l1x2);

            vst1q_s16(dstx + i, ldx);
            vst1q_s16(dsty + i, ldy);
        }

        //tail: recompute the last 8 outputs from unaligned loads at width-8
        if (lookRight == 0 || i != width)
        {
            uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
            uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
            uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);

            int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
            int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
            int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
            int16x8_t taildy = vqaddq_s16(tail02, tail1x2);

            vst1q_s16(dstx + (width - 8), taildx);
            vst1q_s16(dsty + (width - 8), taildy);
        }
    }

    uint8x8_t vfmask;   // left-border shuffle mask
    uint8x8_t vtmask;   // right-border shuffle mask
    enum { offsetk = 1};  // kernel radius
    ptrdiff_t lookLeft;
    ptrdiff_t lookRight;
};
|
||||
|
||||
// Vertical pass of the Canny Sobel filter, L1 (|dx|+|dy|) magnitude variant.
// Inputs are the three row-filtered lines (x-part at offset 0, y-part at
// offset `width`); outputs the final dx, dy and the s32 magnitude row.
// The trailing `goto` re-runs the loop body once, re-aligned at width-8,
// to cover a non-multiple-of-8 tail (overlapping stores are harmless).
template <bool L2gradient>
inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
    ptrdiff_t j = 0;
    for (; j <= width - 8; j += 8)
    {
ColFilter3x3CannyL1Loop:
        int16x8_t line0x = vld1q_s16(src0 + j);
        int16x8_t line1x = vld1q_s16(src1 + j);
        int16x8_t line2x = vld1q_s16(src2 + j);
        int16x8_t line0y = vld1q_s16(src0 + j + width);
        int16x8_t line2y = vld1q_s16(src2 + j + width);

        // dx = row0 + 2*row1 + row2 of the horizontal differences
        int16x8_t l02 = vaddq_s16(line0x, line2x);
        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
        // dy = bottom smoothed row - top smoothed row
        int16x8_t dy = vsubq_s16(line2y, line0y);
        int16x8_t dx = vaddq_s16(l1x2, l02);

        // L1 norm: |dx| + |dy|, widened to s32
        int16x8_t dya = vabsq_s16(dy);
        int16x8_t dxa = vabsq_s16(dx);
        int16x8_t norm = vaddq_s16(dya, dxa);

        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
        int32x4_t norml = vmovl_s16(vget_low_s16(norm));

        vst1q_s16(dsty + j, dy);
        vst1q_s16(dstx + j, dx);
        vst1q_s32(mag + j + 4, normh);
        vst1q_s32(mag + j, norml);
    }
    if (j != width)
    {
        // process the last (overlapping) 8 elements
        j = width - 8;
        goto ColFilter3x3CannyL1Loop;
    }
}
|
||||
// Vertical Sobel pass, L2 (dx*dx + dy*dy) magnitude specialization.
// Identical structure to the generic (L1) version; only the norm math
// differs: squared magnitude accumulated in s32 via vmull/vmlal.
template <>
inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
    ptrdiff_t j = 0;
    for (; j <= width - 8; j += 8)
    {
ColFilter3x3CannyL2Loop:
        int16x8_t line0x = vld1q_s16(src0 + j);
        int16x8_t line1x = vld1q_s16(src1 + j);
        int16x8_t line2x = vld1q_s16(src2 + j);
        int16x8_t line0y = vld1q_s16(src0 + j + width);
        int16x8_t line2y = vld1q_s16(src2 + j + width);

        int16x8_t l02 = vaddq_s16(line0x, line2x);
        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
        int16x8_t dy = vsubq_s16(line2y, line0y);
        int16x8_t dx = vaddq_s16(l1x2, l02);

        // L2 norm: dx^2 + dy^2 in widening multiplies
        int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
        int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));

        norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
        normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));

        vst1q_s16(dsty + j, dy);
        vst1q_s16(dstx + j, dx);
        vst1q_s32(mag + j, norml);
        vst1q_s32(mag + j + 4, normh);
    }
    if (j != width)
    {
        // process the last (overlapping) 8 elements
        j = width - 8;
        goto ColFilter3x3CannyL2Loop;
    }
}
|
||||
|
||||
// Magnitude row for externally supplied Sobel derivatives, L1 variant:
//   _norm[j] = |_dx[j]| + |_dy[j]|
// Software-pipelined: the vectors for iteration j are loaded on the
// previous iteration; one drained vector pair is stored after the loop.
// The scalar loop then (re)writes any remaining elements.
template <bool L2gradient>
inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
    ptrdiff_t j = 0;
    if (colscn >= 8)
    {
        int16x8_t vx = vld1q_s16(_dx);
        int16x8_t vy = vld1q_s16(_dy);
        for (; j <= colscn - 16; j+=8)
        {
            internal::prefetch(_dx);
            internal::prefetch(_dy);

            // preload next iteration's inputs
            int16x8_t vx2 = vld1q_s16(_dx + j + 8);
            int16x8_t vy2 = vld1q_s16(_dy + j + 8);

            int16x8_t vabsx = vabsq_s16(vx);
            int16x8_t vabsy = vabsq_s16(vy);

            int16x8_t norm = vaddq_s16(vabsx, vabsy);

            int32x4_t normh = vmovl_s16(vget_high_s16(norm));
            int32x4_t norml = vmovl_s16(vget_low_s16(norm));

            vst1q_s32(_norm + j + 4, normh);
            vst1q_s32(_norm + j + 0, norml);

            vx = vx2;
            vy = vy2;
        }
        // drain the last preloaded vector pair
        int16x8_t vabsx = vabsq_s16(vx);
        int16x8_t vabsy = vabsq_s16(vy);

        int16x8_t norm = vaddq_s16(vabsx, vabsy);

        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
        int32x4_t norml = vmovl_s16(vget_low_s16(norm));

        vst1q_s32(_norm + j + 4, normh);
        vst1q_s32(_norm + j + 0, norml);
    }
    // scalar tail (overlaps the drained store; same values rewritten)
    for (; j < colscn; j++)
        _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
}
|
||||
|
||||
// Magnitude row for external Sobel derivatives, L2 specialization:
//   _norm[j] = _dx[j]^2 + _dy[j]^2
// Same software-pipelined structure as the L1 version.
template <>
inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
    ptrdiff_t j = 0;
    if (colscn >= 8)
    {
        int16x8_t vx = vld1q_s16(_dx);
        int16x8_t vy = vld1q_s16(_dy);

        for (; j <= colscn - 16; j+=8)
        {
            internal::prefetch(_dx);
            internal::prefetch(_dy);

            // preload next iteration's inputs
            int16x8_t vxnext = vld1q_s16(_dx + j + 8);
            int16x8_t vynext = vld1q_s16(_dy + j + 8);

            int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
            int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));

            norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
            normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));

            vst1q_s32(_norm + j + 0, norml);
            vst1q_s32(_norm + j + 4, normh);

            vx = vxnext;
            vy = vynext;
        }
        // drain the last preloaded vector pair
        int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
        int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));

        norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
        normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));

        vst1q_s32(_norm + j + 0, norml);
        vst1q_s32(_norm + j + 4, normh);
    }
    // scalar tail
    for (; j < colscn; j++)
        _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
}
|
||||
|
||||
// Converts the user's f64 hysteresis thresholds into integer thresholds for
// the L1 magnitude (generic case). Values are ordered (low <= high) and then
// floored: the "-= (x > thresh)" / "-= (diff < 0)" corrections turn a
// truncation/round into a floor so that the comparison `mag > low` matches
// the fractional threshold exactly.
template <bool L2gradient>
inline void prepareThresh(f64 low_thresh, f64 high_thresh,
                          s32 &low, s32 &high)
{
    if (low_thresh > high_thresh)
        std::swap(low_thresh, high_thresh);
#if defined __GNUC__
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    low -= (low > low_thresh);
    high -= (high > high_thresh);
#else
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    f32 ldiff = (f32)(low_thresh - low);
    f32 hdiff = (f32)(high_thresh - high);
    low -= (ldiff < 0);
    high -= (hdiff < 0);
#endif
}
|
||||
// L2 specialization: the pipeline compares against squared magnitudes, so
// positive thresholds are squared first; the floor logic is the same as in
// the generic version.
template <>
inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
                                s32 &low, s32 &high)
{
    if (low_thresh > high_thresh)
        std::swap(low_thresh, high_thresh);
    // square only positive thresholds (squaring a negative would flip order)
    if (low_thresh > 0) low_thresh *= low_thresh;
    if (high_thresh > 0) high_thresh *= high_thresh;
#if defined __GNUC__
    low = (s32)low_thresh;
    high = (s32)high_thresh;
    low -= (low > low_thresh);
    high -= (high > high_thresh);
#else
    low = internal::round(low_thresh);
    high = internal::round(high_thresh);
    f32 ldiff = (f32)(low_thresh - low);
    f32 hdiff = (f32)(high_thresh - high);
    low -= (ldiff < 0);
    high -= (hdiff < 0);
#endif
}
|
||||
|
||||
// Row-provider for Canny3x3 when Sobel is computed internally
// (externalSobel == false). Owns one scratch buffer laid out as three
// magnitude rows (ring buffer) followed by the edge map. Each magnitude row
// additionally carries, reinterpreted as s16, four per-pixel scratch planes
// at dxOffset/dyOffset (final derivatives) and shxOffset/shyOffset
// (row-filtered Sobel halves).
template <bool L2gradient, bool externalSobel>
struct _normEstimator
{
    ptrdiff_t magstep;    // s32 stride between the three ring-buffer rows
    ptrdiff_t dxOffset;   // s16 offset of the dx plane inside a row
    ptrdiff_t dyOffset;   // s16 offset of the dy plane
    ptrdiff_t shxOffset;  // s16 offset of the horizontal Sobel x-half
    ptrdiff_t shyOffset;  // s16 offset of the horizontal Sobel y-half
    std::vector<u8> buffer;
    const ptrdiff_t offsetk;              // kernel radius (1 for 3x3)
    ptrdiff_t borderyt, borderyb;         // rows missing above/below the ROI
    RowFilter3x3Canny sobelRow;

    inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
        offsetk(1),
        sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
                 std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
    {
        mapstep = size.width + 2;  // map row = width + 1-pixel guard on each side
        // room for the s32 magnitudes plus the four s16 scratch planes
        magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
        dxOffset = mapstep * sizeof(s32)/sizeof(s16);
        dyOffset = dxOffset + size.width * 1;
        shxOffset = dxOffset + size.width * 2;
        shyOffset = dxOffset + size.width * 3;
        buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
        mag_buf[0] = (s32*)&buffer[0];
        mag_buf[1] = mag_buf[0] + magstep;
        mag_buf[2] = mag_buf[1] + magstep;
        memset(mag_buf[0], 0, mapstep * sizeof(s32));

        // edge map follows the three magnitude rows; guard rows marked "1"
        // (= cannot be an edge) at top and bottom
        map = (u8*)(mag_buf[2] + magstep);
        memset(map, 1, mapstep);
        memset(map + mapstep*(size.height + 1), 1, mapstep);
        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
    }
    // Primes the ring buffer: row-filters image rows 0 and 1, then runs the
    // column filter for output row 0 (replicating row 0 upward when no real
    // row -1 exists inside the margin).
    inline void firstRow(const Size2D &size, s32,
                         const u8 *srcBase, ptrdiff_t srcStride,
                         s16*, ptrdiff_t,
                         s16*, ptrdiff_t,
                         s32** mag_buf)
    {
        //sobelH row #0
        const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
        sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
        //sobelH row #1
        _src = internal::getRowPtr(srcBase, srcStride, 1);
        sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);

        mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
        if (borderyt == 0)
        {
            //sobelH row #-1
            _src = internal::getRowPtr(srcBase, srcStride, -1);
            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);

            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
                                           ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
        }
        else
        {
            // no pixels above the ROI: replicate row 0 as the "row -1" input
            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
                                           ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
        }
    }
    // Produces derivatives/magnitude for row i into mag_buf[2] (replicating
    // the last row at the bottom border, zero-filling past the image) and
    // exposes row i-1's dx/dy planes through _x/_y.
    inline void nextRow(const Size2D &size, s32,
                        const u8 *srcBase, ptrdiff_t srcStride,
                        s16*, ptrdiff_t,
                        s16*, ptrdiff_t,
                        const ptrdiff_t &mapstep, s32** mag_buf,
                        size_t i, const s16* &_x, const s16* &_y)
    {
        mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
        if (i < size.height - borderyb)
        {
            const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
            //sobelH row #i+1
            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);

            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
                                           ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
        }
        else if (i < size.height)
        {
            // bottom border: replicate the last available row
            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
                                           ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
        }
        else
            memset(mag_buf[2], 0, mapstep*sizeof(s32));
        _x = ((s16*)mag_buf[1]) + dxOffset;
        _y = ((s16*)mag_buf[1]) + dyOffset;
    }
};
|
||||
// Row-provider for Canny3x3 when Sobel derivatives are supplied by the
// caller (externalSobel == true). Only magnitude rows and the edge map are
// buffered; derivatives are read (and, for cn > 1, compacted in place)
// directly from the caller's dx/dy buffers.
template <bool L2gradient>
struct _normEstimator<L2gradient, true>
{
    std::vector<u8> buffer;

    inline _normEstimator(const Size2D &size, s32 cn, Margin,
                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
    {
        mapstep = size.width + 2;  // map row = width + 1-pixel guard each side
        buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
        mag_buf[0] = (s32*)&buffer[0];
        mag_buf[1] = mag_buf[0] + mapstep*cn;
        mag_buf[2] = mag_buf[1] + mapstep*cn;
        memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));

        map = (u8*)(mag_buf[2] + mapstep*cn);
        // guard rows marked "1" (= cannot be an edge) at top and bottom
        memset(map, 1, mapstep);
        memset(map + mapstep*(size.height + 1), 1, mapstep);
    }
    // Computes the magnitude of derivative row 0; for multi-channel input,
    // keeps per pixel the channel with the largest magnitude (compacting
    // _norm/_dx/_dy down to single-channel layout in place).
    inline void firstRow(const Size2D &size, s32 cn,
                         const u8 *, ptrdiff_t,
                         s16* dxBase, ptrdiff_t dxStride,
                         s16* dyBase, ptrdiff_t dyStride,
                         s32** mag_buf)
    {
        s32* _norm = mag_buf[1] + 1;

        s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
        s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);

        NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);

        if(cn > 1)
        {
            // per-pixel channel reduction: keep the strongest channel
            for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
            {
                size_t maxIdx = jn;
                for(s32 k = 1; k < cn; ++k)
                    if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
                _norm[j] = _norm[maxIdx];
                _dx[j] = _dx[maxIdx];
                _dy[j] = _dy[maxIdx];
            }
        }

        _norm[-1] = _norm[size.width] = 0;  // zero the guard columns
    }
    // Computes the magnitude of derivative row i (zero row past the image)
    // and exposes row i-1's derivatives through _x/_y.
    inline void nextRow(const Size2D &size, s32 cn,
                        const u8 *, ptrdiff_t,
                        s16* dxBase, ptrdiff_t dxStride,
                        s16* dyBase, ptrdiff_t dyStride,
                        const ptrdiff_t &mapstep, s32** mag_buf,
                        size_t i, const s16* &_x, const s16* &_y)
    {
        s32* _norm = mag_buf[(i > 0) + 1] + 1;
        if (i < size.height)
        {
            s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
            s16* _dy = internal::getRowPtr(dyBase, dyStride, i);

            NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);

            if(cn > 1)
            {
                // per-pixel channel reduction: keep the strongest channel
                for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
                {
                    size_t maxIdx = jn;
                    for(s32 k = 1; k < cn; ++k)
                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
                    _norm[j] = _norm[maxIdx];
                    _dx[j] = _dx[maxIdx];
                    _dy[j] = _dy[maxIdx];
                }
            }

            _norm[-1] = _norm[size.width] = 0;  // zero the guard columns
        }
        else
            memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));

        _x = internal::getRowPtr(dxBase, dxStride, i-1);
        _y = internal::getRowPtr(dyBase, dyStride, i-1);
    }
};
|
||||
|
||||
// Full Canny pipeline on one ROI: gradient magnitude (delegated to the
// _normEstimator policy), non-maximum suppression with integer tan(22.5°)
// sector tests, stack-based hysteresis tracking, and final 0/255 map output.
// dxBase/dyBase are only used when externalSobel is true.
template <bool L2gradient, bool externalSobel>
inline void Canny3x3(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     s16 * dxBase, ptrdiff_t dxStride,
                     s16 * dyBase, ptrdiff_t dyStride,
                     f64 low_thresh, f64 high_thresh,
                     Margin borderMargin)
{
    s32 low, high;
    prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);

    ptrdiff_t mapstep;
    s32* mag_buf[3];  // 3-row ring buffer: previous / current / next magnitudes
    u8* map;          // edge map with a 1-pixel guard border
    _normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);

    // grow-on-demand stack of candidate edge pixels for hysteresis
    size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
    std::vector<u8*> stack( maxsize );
    u8 **stack_top = &stack[0];
    u8 **stack_bottom = &stack[0];

    /* sector numbers
       (Top-Left Origin)

        1   2   3
         *  *  *
          * * *
        0*******0
          * * *
         *  *  *
        3   2   1
    */

#define CANNY_PUSH(d)    *(d) = u8(2), *stack_top++ = (d)
#define CANNY_POP(d)     (d) = *--stack_top

    //i == 0
    normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);
    // calculate magnitude and angle of gradient, perform non-maxima supression.
    // fill the map with one of the following values:
    //   0 - the pixel might belong to an edge
    //   1 - the pixel can not belong to an edge
    //   2 - the pixel does belong to an edge
    for (size_t i = 1; i <= size.height; i++)
    {
        const s16 *_x, *_y;
        normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);

        u8* _map = map + mapstep*i + 1;
        _map[-1] = _map[size.width] = 1;

        s32* _mag = mag_buf[1] + 1; // take the central row
        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];  // offset to the next row
        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];  // offset to the previous row

        // make sure a whole row of candidates still fits on the stack
        if ((stack_top - stack_bottom) + size.width > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            maxsize = maxsize * 3/2;
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        s32 prev_flag = 0;  // was the previous pixel pushed as a strong edge?
        for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
        {
#define CANNY_SHIFT 15
            // tan(22.5 deg) in Q15 fixed point
            const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

            s32 m = _mag[j];

            if (m > low)
            {
                s32 xs = _x[j];
                s32 ys = _y[j];
                s32 x = abs(xs);
                s32 y = abs(ys) << CANNY_SHIFT;

                s32 tg22x = x * TG22;

                if (y < tg22x)
                {
                    // near-horizontal gradient: compare against left/right
                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
                }
                else
                {
                    s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
                    if (y > tg67x)
                    {
                        // near-vertical gradient: compare against up/down
                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
                    }
                    else
                    {
                        // diagonal: sign of dx*dy picks which diagonal
                        s32 s = (xs ^ ys) < 0 ? -1 : 1;
                        if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
                    }
                }
            }
            prev_flag = 0;
            _map[j] = u8(1);
            continue;
__push:
            // strong edge seed only if not adjacent to one already pushed
            if (!prev_flag && m > high && _map[j-mapstep] != 2)
            {
                CANNY_PUSH(_map + j);
                prev_flag = 1;
            }
            else
                _map[j] = 0;
        }

        // scroll the ring buffer
        _mag = mag_buf[0];
        mag_buf[0] = mag_buf[1];
        mag_buf[1] = mag_buf[2];
        mag_buf[2] = _mag;
    }

    // now track the edges (hysteresis thresholding)
    while (stack_top > stack_bottom)
    {
        u8* m;
        // up to 8 neighbours may be pushed per pop; keep headroom
        if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
        {
            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
            maxsize = maxsize * 3/2;
            stack.resize(maxsize);
            stack_bottom = &stack[0];
            stack_top = stack_bottom + sz;
        }

        CANNY_POP(m);

        if (!m[-1])         CANNY_PUSH(m - 1);
        if (!m[1])          CANNY_PUSH(m + 1);
        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
    }

    // the final pass, form the final image: map value 2 -> 255, else 0
    uint8x16_t v2 = vmovq_n_u8(2);
    const u8* ptrmap = map + mapstep + 1;
    for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
    {
        u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
        ptrdiff_t j = 0;
        for (; j < (ptrdiff_t)size.width - 16; j += 16)
        {
            internal::prefetch(ptrmap);
            uint8x16_t vmap = vld1q_u8(ptrmap + j);
            uint8x16_t vdst = vceqq_u8(vmap, v2);  // 0xFF where map==2
            vst1q_u8(_dst+j, vdst);
        }
        for (; j < (ptrdiff_t)size.width; j++)
            _dst[j] = (u8)-(ptrmap[j] >> 1);  // 2 -> 255, 0/1 -> 0
    }
}
|
||||
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
bool isCanny3x3Supported(const Size2D &size)
|
||||
{
|
||||
return isSupportedConfiguration() &&
|
||||
size.height >= 2 && size.width >= 9;
|
||||
}
|
||||
|
||||
void Canny3x3L1(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
f64 low_thresh, f64 high_thresh,
|
||||
Margin borderMargin)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
|
||||
#ifdef CAROTENE_NEON
|
||||
Canny3x3<false, false>(size, 1,
|
||||
srcBase, srcStride,
|
||||
dstBase, dstStride,
|
||||
NULL, 0,
|
||||
NULL, 0,
|
||||
low_thresh, high_thresh,
|
||||
borderMargin);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)low_thresh;
|
||||
(void)high_thresh;
|
||||
(void)borderMargin;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Canny3x3L2(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
f64 low_thresh, f64 high_thresh,
|
||||
Margin borderMargin)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
|
||||
#ifdef CAROTENE_NEON
|
||||
Canny3x3<true, false>(size, 1,
|
||||
srcBase, srcStride,
|
||||
dstBase, dstStride,
|
||||
NULL, 0,
|
||||
NULL, 0,
|
||||
low_thresh, high_thresh,
|
||||
borderMargin);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)low_thresh;
|
||||
(void)high_thresh;
|
||||
(void)borderMargin;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Canny3x3L1(const Size2D &size, s32 cn,
|
||||
s16 * dxBase, ptrdiff_t dxStride,
|
||||
s16 * dyBase, ptrdiff_t dyStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
f64 low_thresh, f64 high_thresh)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
Canny3x3<false, true>(size, cn,
|
||||
NULL, 0,
|
||||
dstBase, dstStride,
|
||||
dxBase, dxStride,
|
||||
dyBase, dyStride,
|
||||
low_thresh, high_thresh,
|
||||
Margin());
|
||||
#else
|
||||
(void)size;
|
||||
(void)cn;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)dxBase;
|
||||
(void)dxStride;
|
||||
(void)dyBase;
|
||||
(void)dyStride;
|
||||
(void)low_thresh;
|
||||
(void)high_thresh;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Canny3x3L2(const Size2D &size, s32 cn,
|
||||
s16 * dxBase, ptrdiff_t dxStride,
|
||||
s16 * dyBase, ptrdiff_t dyStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
f64 low_thresh, f64 high_thresh)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
Canny3x3<true, true>(size, cn,
|
||||
NULL, 0,
|
||||
dstBase, dstStride,
|
||||
dxBase, dxStride,
|
||||
dyBase, dyStride,
|
||||
low_thresh, high_thresh,
|
||||
Margin());
|
||||
#else
|
||||
(void)size;
|
||||
(void)cn;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)dxBase;
|
||||
(void)dxStride;
|
||||
(void)dyBase;
|
||||
(void)dyStride;
|
||||
(void)low_thresh;
|
||||
(void)high_thresh;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
486
3rdparty/carotene/src/channel_extract.cpp
vendored
Normal file
486
3rdparty/carotene/src/channel_extract.cpp
vendored
Normal file
@ -0,0 +1,486 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
void extract2(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
u32 coi)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
#ifndef ANDROID
|
||||
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||||
#endif
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0u; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t sj = 0u, dj = 0u;
|
||||
|
||||
#ifndef ANDROID
|
||||
for (; dj < roiw32; sj += 64, dj += 32)
|
||||
{
|
||||
internal::prefetch(src + sj);
|
||||
|
||||
uint8x16x2_t v_src = vld2q_u8(src + sj);
|
||||
vst1q_u8(dst + dj, v_src.val[coi]);
|
||||
|
||||
v_src = vld2q_u8(src + sj + 32);
|
||||
vst1q_u8(dst + dj + 16, v_src.val[coi]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; dj < roiw8; sj += 16, dj += 8)
|
||||
{
|
||||
uint8x8x2_t v_src = vld2_u8(src + sj);
|
||||
vst1_u8(dst + dj, v_src.val[coi]);
|
||||
}
|
||||
|
||||
for (; dj < size.width; sj += 2, ++dj)
|
||||
{
|
||||
dst[dj] = src[sj + coi];
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)coi;
|
||||
#endif
|
||||
}
|
||||
|
||||
void extract3(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
u32 coi)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
#ifndef ANDROID
|
||||
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||||
#endif
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0u; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t sj = 0u, dj = 0u;
|
||||
|
||||
#ifndef ANDROID
|
||||
for (; dj < roiw32; sj += 96, dj += 32)
|
||||
{
|
||||
internal::prefetch(src + sj);
|
||||
|
||||
uint8x16x3_t v_src = vld3q_u8(src + sj);
|
||||
vst1q_u8(dst + dj, v_src.val[coi]);
|
||||
|
||||
v_src = vld3q_u8(src + sj + 48);
|
||||
vst1q_u8(dst + dj + 16, v_src.val[coi]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; dj < roiw8; sj += 24, dj += 8)
|
||||
{
|
||||
uint8x8x3_t v_src = vld3_u8(src + sj);
|
||||
vst1_u8(dst + dj, v_src.val[coi]);
|
||||
}
|
||||
|
||||
for (; dj < size.width; sj += 3, ++dj)
|
||||
{
|
||||
dst[dj] = src[sj + coi];
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)coi;
|
||||
#endif
|
||||
}
|
||||
|
||||
void extract4(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
u32 coi)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
#ifndef ANDROID
|
||||
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||||
#endif
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0u; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t sj = 0u, dj = 0u;
|
||||
|
||||
#ifndef ANDROID
|
||||
for (; dj < roiw32; sj += 128, dj += 32)
|
||||
{
|
||||
internal::prefetch(src + sj);
|
||||
|
||||
uint8x16x4_t v_src = vld4q_u8(src + sj);
|
||||
vst1q_u8(dst + dj, v_src.val[coi]);
|
||||
|
||||
v_src = vld4q_u8(src + sj + 64);
|
||||
vst1q_u8(dst + dj + 16, v_src.val[coi]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; dj < roiw8; sj += 32, dj += 8)
|
||||
{
|
||||
uint8x8x4_t v_src = vld4_u8(src + sj);
|
||||
vst1_u8(dst + dj, v_src.val[coi]);
|
||||
}
|
||||
|
||||
for (; dj < size.width; sj += 4, ++dj)
|
||||
{
|
||||
dst[dj] = src[sj + coi];
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)coi;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define FILL_LINES2(macro,type) \
|
||||
macro##_LINE(type,0) \
|
||||
macro##_LINE(type,1)
|
||||
#define FILL_LINES3(macro,type) \
|
||||
FILL_LINES2(macro,type) \
|
||||
macro##_LINE(type,2)
|
||||
#define FILL_LINES4(macro,type) \
|
||||
FILL_LINES3(macro,type) \
|
||||
macro##_LINE(type,3)
|
||||
|
||||
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
|
||||
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
|
||||
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
|
||||
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];
|
||||
|
||||
#define MUL2(val) (val << 1)
|
||||
#define MUL3(val) (MUL2(val) + val)
|
||||
#define MUL4(val) (val << 2)
|
||||
|
||||
#define CONTDST2 srcStride == dst0Stride && \
|
||||
srcStride == dst1Stride &&
|
||||
#define CONTDST3 srcStride == dst0Stride && \
|
||||
srcStride == dst1Stride && \
|
||||
srcStride == dst2Stride &&
|
||||
#define CONTDST4 srcStride == dst0Stride && \
|
||||
srcStride == dst1Stride && \
|
||||
srcStride == dst2Stride && \
|
||||
srcStride == dst3Stride &&
|
||||
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
|
||||
"vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
|
||||
"vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
|
||||
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
|
||||
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
|
||||
: \
|
||||
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
|
||||
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3" \
|
||||
);
|
||||
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
|
||||
"vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
|
||||
"vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
|
||||
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
|
||||
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
|
||||
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
|
||||
: \
|
||||
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
|
||||
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3","d4","d5" \
|
||||
);
|
||||
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
|
||||
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
|
||||
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
|
||||
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
|
||||
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
|
||||
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
|
||||
"vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
|
||||
: \
|
||||
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
|
||||
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3","d4","d5","d6","d7" \
|
||||
);
|
||||
|
||||
#define SPLIT_QUAD(sgn, bits, n) { \
|
||||
internal::prefetch(src + sj); \
|
||||
SPLIT_ASM##n(sgn, bits) \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define SPLIT_QUAD(sgn, bits, n) { \
|
||||
internal::prefetch(src + sj); \
|
||||
vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
|
||||
FILL_LINES##n(VST1Q, sgn##bits) \
|
||||
}
|
||||
|
||||
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
|
||||
const sgn##bits * srcBase, ptrdiff_t srcStride \
|
||||
FILL_LINES##n(FARG, sgn##bits) ) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
Size2D size(_size); \
|
||||
if (CONTDST##n \
|
||||
dst0Stride == (ptrdiff_t)(size.width)) \
|
||||
{ \
|
||||
size.width *= size.height; \
|
||||
size.height = 1; \
|
||||
} \
|
||||
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
|
||||
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
|
||||
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
|
||||
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
|
||||
\
|
||||
for (size_t i = 0u; i < size.height; ++i) \
|
||||
{ \
|
||||
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
|
||||
FILL_LINES##n(VROW, sgn##bits) \
|
||||
size_t sj = 0u, dj = 0u; \
|
||||
\
|
||||
for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
|
||||
SPLIT_QUAD(sgn, bits, n) \
|
||||
\
|
||||
if (dj < roiw8) \
|
||||
{ \
|
||||
vec64 v_src = vld##n##_##sgn##bits(src + sj); \
|
||||
FILL_LINES##n(VST1, sgn##bits) \
|
||||
sj += MUL##n(8)/sizeof(sgn##bits); \
|
||||
dj += 8/sizeof(sgn##bits); \
|
||||
} \
|
||||
\
|
||||
for (; dj < size.width; sj += n, ++dj) \
|
||||
{ \
|
||||
FILL_LINES##n(SST, sgn##bits) \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
|
||||
const sgn##64 * srcBase, ptrdiff_t srcStride \
|
||||
FILL_LINES##n(FARG, sgn##64) ) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
Size2D size(_size); \
|
||||
if (CONTDST##n \
|
||||
dst0Stride == (ptrdiff_t)(size.width)) \
|
||||
{ \
|
||||
size.width *= size.height; \
|
||||
size.height = 1; \
|
||||
} \
|
||||
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
|
||||
\
|
||||
for (size_t i = 0u; i < size.height; ++i) \
|
||||
{ \
|
||||
const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
|
||||
FILL_LINES##n(VROW, sgn##64) \
|
||||
size_t sj = 0u, dj = 0u; \
|
||||
\
|
||||
for (; dj < size.width; sj += n, ++dj) \
|
||||
{ \
|
||||
vec64 v_src = vld##n##_##sgn##64(src + sj); \
|
||||
FILL_LINES##n(VST1, sgn##64) \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define ALPHA_QUAD(sgn, bits) { \
|
||||
internal::prefetch(src + sj); \
|
||||
__asm__ ( \
|
||||
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
|
||||
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
|
||||
"vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
|
||||
"vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
|
||||
"vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
|
||||
: \
|
||||
: [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
|
||||
[in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3","d4","d5","d6","d7" \
|
||||
); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ALPHA_QUAD(sgn, bits) { \
|
||||
internal::prefetch(src + sj); \
|
||||
union { vec128_4 v4; vec128_3 v3; } vals; \
|
||||
vals.v4 = vld4q_##sgn##bits(src + sj); \
|
||||
vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
|
||||
vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
|
||||
}
|
||||
|
||||
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
|
||||
const sgn##bits * srcBase, ptrdiff_t srcStride, \
|
||||
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
|
||||
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
Size2D size(_size); \
|
||||
if (srcStride == dst3Stride && \
|
||||
srcStride == dst1Stride && \
|
||||
srcStride == (ptrdiff_t)(size.width)) \
|
||||
{ \
|
||||
size.width *= size.height; \
|
||||
size.height = 1; \
|
||||
} \
|
||||
typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
|
||||
typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
|
||||
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
|
||||
typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
|
||||
typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
|
||||
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
|
||||
\
|
||||
for (size_t i = 0u; i < size.height; ++i) \
|
||||
{ \
|
||||
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
|
||||
sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
|
||||
sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
|
||||
size_t sj = 0u, d3j = 0u, d1j = 0u; \
|
||||
\
|
||||
for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
|
||||
d1j += 16/sizeof(sgn##bits)) \
|
||||
ALPHA_QUAD(sgn, bits) \
|
||||
\
|
||||
if (d1j < roiw8) \
|
||||
{ \
|
||||
union { vec64_4 v4; vec64_3 v3; } vals; \
|
||||
vals.v4 = vld4_##sgn##bits(src + sj); \
|
||||
vst3_u8(dst3 + d3j, vals.v3); \
|
||||
vst1_u8(dst1 + d1j, vals.v4.val[3]); \
|
||||
sj += MUL4(8)/sizeof(sgn##bits); \
|
||||
d3j += MUL3(8)/sizeof(sgn##bits); \
|
||||
d1j += 8/sizeof(sgn##bits); \
|
||||
} \
|
||||
\
|
||||
for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
|
||||
{ \
|
||||
dst3[d3j+0] = src[sj + 0]; \
|
||||
dst3[d3j+1] = src[sj + 1]; \
|
||||
dst3[d3j+2] = src[sj + 2]; \
|
||||
dst1[d1j] = src[sj + 3]; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;
|
||||
|
||||
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
|
||||
const sgn##bits * srcBase, ptrdiff_t srcStride \
|
||||
FILL_LINES##n(FARG, sgn##bits) ) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
(void)size; \
|
||||
(void)srcBase; \
|
||||
(void)srcStride; \
|
||||
FILL_LINES##n(VOID, sgn##bits) \
|
||||
}
|
||||
|
||||
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
|
||||
|
||||
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
|
||||
const sgn##bits * srcBase, ptrdiff_t srcStride, \
|
||||
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
|
||||
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
(void)size; \
|
||||
(void)srcBase; \
|
||||
(void)srcStride; \
|
||||
(void)dst3Base; \
|
||||
(void)dst3Stride; \
|
||||
(void)dst1Base; \
|
||||
(void)dst1Stride; \
|
||||
}
|
||||
|
||||
#endif //CAROTENE_NEON
|
||||
|
||||
SPLIT(u, 8,2)
|
||||
SPLIT(u, 8,3)
|
||||
SPLIT(u, 8,4)
|
||||
SPLIT(u,16,2)
|
||||
SPLIT(u,16,3)
|
||||
SPLIT(u,16,4)
|
||||
SPLIT(s,32,2)
|
||||
SPLIT(s,32,3)
|
||||
SPLIT(s,32,4)
|
||||
|
||||
SPLIT64(s, 2)
|
||||
SPLIT64(s, 3)
|
||||
SPLIT64(s, 4)
|
||||
|
||||
SPLIT4ALPHA(u,8)
|
||||
|
||||
} // namespace CAROTENE_NS
|
389
3rdparty/carotene/src/channels_combine.cpp
vendored
Normal file
389
3rdparty/carotene/src/channels_combine.cpp
vendored
Normal file
@ -0,0 +1,389 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#define FILL_LINES2(macro,type) \
|
||||
macro##_LINE(type,0) \
|
||||
macro##_LINE(type,1)
|
||||
#define FILL_LINES3(macro,type) \
|
||||
FILL_LINES2(macro,type) \
|
||||
macro##_LINE(type,2)
|
||||
#define FILL_LINES4(macro,type) \
|
||||
FILL_LINES3(macro,type) \
|
||||
macro##_LINE(type,3)
|
||||
|
||||
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
|
||||
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
|
||||
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
|
||||
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
|
||||
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
|
||||
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
|
||||
|
||||
#define MUL2(val) (val << 1)
|
||||
#define MUL3(val) (MUL2(val) + val)
|
||||
#define MUL4(val) (val << 2)
|
||||
|
||||
#define CONTSRC2 dstStride == src0Stride && \
|
||||
dstStride == src1Stride &&
|
||||
#define CONTSRC3 dstStride == src0Stride && \
|
||||
dstStride == src1Stride && \
|
||||
dstStride == src2Stride &&
|
||||
#define CONTSRC4 dstStride == src0Stride && \
|
||||
dstStride == src1Stride && \
|
||||
dstStride == src2Stride && \
|
||||
dstStride == src3Stride &&
|
||||
|
||||
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define MERGE_ASM2(sgn, bits) __asm__ ( \
|
||||
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||||
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||||
"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
|
||||
"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
|
||||
: \
|
||||
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
|
||||
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3" \
|
||||
);
|
||||
#define MERGE_ASM3(sgn, bits) __asm__ ( \
|
||||
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||||
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||||
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
|
||||
"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
|
||||
"vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
|
||||
: \
|
||||
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
|
||||
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3","d4","d5" \
|
||||
);
|
||||
#define MERGE_ASM4(sgn, bits) __asm__ ( \
|
||||
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||||
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||||
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
|
||||
"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
|
||||
"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
|
||||
"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
|
||||
: \
|
||||
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
|
||||
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
|
||||
: "d0","d1","d2","d3","d4","d5","d6","d7" \
|
||||
);
|
||||
|
||||
#define MERGE_QUAD(sgn, bits, n) { \
|
||||
FILL_LINES##n(PREF, sgn##bits) \
|
||||
MERGE_ASM##n(sgn, bits) \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define MERGE_QUAD(sgn, bits, n) { \
|
||||
vec128 v_dst; \
|
||||
/*FILL_LINES##n(PREF, sgn##bits) \
|
||||
FILL_LINES##n(VLD1Q, sgn##bits)*/ \
|
||||
FILL_LINES##n(PRLD, sgn##bits) \
|
||||
vst##n##q_##sgn##bits(dst + dj, v_dst); \
|
||||
}
|
||||
|
||||
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||||
|
||||
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
|
||||
FILL_LINES##n(FARG, sgn##bits), \
|
||||
sgn##bits * dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
Size2D size(_size); \
|
||||
if (CONTSRC##n \
|
||||
dstStride == (ptrdiff_t)(size.width)) \
|
||||
{ \
|
||||
size.width *= size.height; \
|
||||
size.height = 1; \
|
||||
} \
|
||||
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
|
||||
size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
|
||||
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
|
||||
size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
|
||||
\
|
||||
for (size_t i = 0u; i < size.height; ++i) \
|
||||
{ \
|
||||
FILL_LINES##n(VROW, sgn##bits) \
|
||||
sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
|
||||
size_t sj = 0u, dj = 0u; \
|
||||
\
|
||||
for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
|
||||
MERGE_QUAD(sgn, bits, n) \
|
||||
\
|
||||
if ( sj < roiw8 ) \
|
||||
{ \
|
||||
vec64 v_dst; \
|
||||
FILL_LINES##n(VLD1, sgn##bits) \
|
||||
vst##n##_##sgn##bits(dst + dj, v_dst); \
|
||||
sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
|
||||
} \
|
||||
\
|
||||
for (; sj < size.width; ++sj, dj += n) \
|
||||
{ \
|
||||
FILL_LINES##n(SLD, sgn##bits) \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
|
||||
FILL_LINES##n(FARG, sgn##64), \
|
||||
sgn##64 * dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
Size2D size(_size); \
|
||||
if (CONTSRC##n \
|
||||
dstStride == (ptrdiff_t)(size.width)) \
|
||||
{ \
|
||||
size.width *= size.height; \
|
||||
size.height = 1; \
|
||||
} \
|
||||
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
|
||||
\
|
||||
for (size_t i = 0u; i < size.height; ++i) \
|
||||
{ \
|
||||
FILL_LINES##n(VROW, sgn##64) \
|
||||
sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
|
||||
size_t sj = 0u, dj = 0u; \
|
||||
\
|
||||
for (; sj < size.width; ++sj, dj += n) \
|
||||
{ \
|
||||
vec64 v_dst; \
|
||||
FILL_LINES##n(VLD1, sgn##64) \
|
||||
vst##n##_##sgn##64(dst + dj, v_dst); \
|
||||
/*FILL_LINES##n(SLD, sgn##64)*/ \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
|
||||
|
||||
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
|
||||
FILL_LINES##n(FARG, sgn##bits), \
|
||||
sgn##bits * dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
(void)size; \
|
||||
FILL_LINES##n(VOID, sgn##bits) \
|
||||
(void)dstBase; \
|
||||
(void)dstStride; \
|
||||
}
|
||||
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
|
||||
|
||||
#endif //CAROTENE_NEON
|
||||
|
||||
COMBINE(u, 8,2)
|
||||
COMBINE(u, 8,3)
|
||||
COMBINE(u, 8,4)
|
||||
COMBINE(u,16,2)
|
||||
COMBINE(u,16,3)
|
||||
COMBINE(u,16,4)
|
||||
COMBINE(s,32,2)
|
||||
COMBINE(s,32,3)
|
||||
COMBINE(s,32,4)
|
||||
COMBINE64(s, 2)
|
||||
COMBINE64(s, 3)
|
||||
COMBINE64(s, 4)
|
||||
|
||||
void combineYUYV(const Size2D &size,
|
||||
const u8 * srcyBase, ptrdiff_t srcyStride,
|
||||
const u8 * srcuBase, ptrdiff_t srcuStride,
|
||||
const u8 * srcvBase, ptrdiff_t srcvStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
#ifndef ANDROID
|
||||
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||||
#endif
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0u; i < size.height; i += 1)
|
||||
{
|
||||
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
|
||||
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
|
||||
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t syj = 0u, sj = 0u, dj = 0u;
|
||||
|
||||
#ifndef ANDROID
|
||||
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
|
||||
{
|
||||
internal::prefetch(srcy + syj);
|
||||
internal::prefetch(srcu + sj);
|
||||
internal::prefetch(srcv + sj);
|
||||
|
||||
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
|
||||
uint8x16x4_t v_dst;
|
||||
v_dst.val[0] = v_y.val[0];
|
||||
v_dst.val[1] = vld1q_u8(srcu + sj);
|
||||
v_dst.val[2] = v_y.val[1];
|
||||
v_dst.val[3] = vld1q_u8(srcv + sj);
|
||||
vst4q_u8(dst + dj, v_dst);
|
||||
|
||||
v_y = vld2q_u8(srcy + syj + 32);
|
||||
v_dst.val[0] = v_y.val[0];
|
||||
v_dst.val[1] = vld1q_u8(srcu + sj + 16);
|
||||
v_dst.val[2] = v_y.val[1];
|
||||
v_dst.val[3] = vld1q_u8(srcv + sj + 16);
|
||||
vst4q_u8(dst + dj + 64, v_dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
|
||||
{
|
||||
uint8x8x2_t v_y = vld2_u8(srcy + syj);
|
||||
uint8x8x4_t v_dst;
|
||||
v_dst.val[0] = v_y.val[0];
|
||||
v_dst.val[1] = vld1_u8(srcu + sj);
|
||||
v_dst.val[2] = v_y.val[1];
|
||||
v_dst.val[3] = vld1_u8(srcv + sj);
|
||||
vst4_u8(dst + dj, v_dst);
|
||||
}
|
||||
|
||||
for (; sj < size.width; ++sj, syj += 2, dj += 4)
|
||||
{
|
||||
dst[dj] = srcy[syj];
|
||||
dst[dj + 1] = srcu[sj];
|
||||
dst[dj + 2] = srcy[syj + 1];
|
||||
dst[dj + 3] = srcv[sj];
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcyBase;
|
||||
(void)srcyStride;
|
||||
(void)srcuBase;
|
||||
(void)srcuStride;
|
||||
(void)srcvBase;
|
||||
(void)srcvStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Interleaves planar Y, U and V data into a packed UYVY (4:2:2) image.
 * Each 4-byte output group is { U, Y0, V, Y1 }: U/V are read once per
 * group while Y advances by 2 (syj += 2 per sj) and dst by 4 (dj += 4),
 * so size.width counts U/V elements (pairs of Y samples).
 * NOTE(review): assumes the Y rows hold at least 2*size.width samples —
 * not verifiable from this function alone; confirm against callers. */
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
    // Largest start index from which a full 32-element iteration fits.
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    // Largest start index from which a full 8-element iteration fits.
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
        const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
        const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t syj = 0u, sj = 0u, dj = 0u;

#ifndef ANDROID
        // Wide path: 32 U/V elements (64 Y samples, 128 dst bytes) per step.
        for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
        {
            internal::prefetch(srcy + syj);
            internal::prefetch(srcu + sj);
            internal::prefetch(srcv + sj);

            // vld2q_u8 de-interleaves Y into even (val[0]) and odd (val[1]) samples.
            uint8x16_t v_y; // placeholder comment removed below
            uint8x16x2_t v_y2 = vld2q_u8(srcy + syj);
            (void)v_y2; // see note
        }
#endif
    }
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
340
3rdparty/carotene/src/cmp.cpp
vendored
Normal file
340
3rdparty/carotene/src/cmp.cpp
vendored
Normal file
@ -0,0 +1,340 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
|
||||
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
|
||||
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
|
||||
|
||||
template <typename Op, int elsize> struct vtail
|
||||
{
|
||||
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
|
||||
u8 * dst, const Op & op,
|
||||
size_t &x, size_t width)
|
||||
{
|
||||
//do nothing since there couldn't be enough data
|
||||
(void)src0;
|
||||
(void)src1;
|
||||
(void)dst;
|
||||
(void)op;
|
||||
(void)x;
|
||||
(void)width;
|
||||
}
|
||||
};
|
||||
template <typename Op> struct vtail<Op, 2>
|
||||
{
|
||||
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
|
||||
u8 * dst, const Op & op,
|
||||
size_t &x, size_t width)
|
||||
{
|
||||
typedef typename Op::type type;
|
||||
typedef typename internal::VecTraits<type>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
|
||||
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
|
||||
if( x + 8 < width)
|
||||
{
|
||||
vec128 v_src0, v_src1;
|
||||
uvec128 v_dst;
|
||||
|
||||
v_src0 = internal::vld1q(src0 + x);
|
||||
v_src1 = internal::vld1q(src1 + x);
|
||||
op(v_src0, v_src1, v_dst);
|
||||
internal::vst1(dst + x, internal::vmovn(v_dst));
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
};
|
||||
template <typename Op> struct vtail<Op, 1>
|
||||
{
|
||||
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
|
||||
u8 * dst, const Op & op,
|
||||
size_t &x, size_t width)
|
||||
{
|
||||
typedef typename Op::type type;
|
||||
typedef typename internal::VecTraits<type>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
|
||||
typedef typename internal::VecTraits<type>::vec64 vec64;
|
||||
typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
|
||||
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
|
||||
if( x + 16 < width)
|
||||
{
|
||||
vec128 v_src0, v_src1;
|
||||
uvec128 v_dst;
|
||||
|
||||
v_src0 = internal::vld1q(src0 + x);
|
||||
v_src1 = internal::vld1q(src1 + x);
|
||||
op(v_src0, v_src1, v_dst);
|
||||
internal::vst1q(dst + x, v_dst);
|
||||
x+=16;
|
||||
}
|
||||
if( x + 8 < width)
|
||||
{
|
||||
vec64 v_src0, v_src1;
|
||||
uvec64 v_dst;
|
||||
|
||||
v_src0 = internal::vld1(src0 + x);
|
||||
v_src1 = internal::vld1(src1 + x);
|
||||
op(v_src0, v_src1, v_dst);
|
||||
internal::vst1(dst + x, v_dst);
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Op>
|
||||
void vcompare(Size2D size,
|
||||
const typename Op::type * src0Base, ptrdiff_t src0Stride,
|
||||
const typename Op::type * src1Base, ptrdiff_t src1Stride,
|
||||
u8 * dstBase, ptrdiff_t dstStride, const Op & op)
|
||||
{
|
||||
typedef typename Op::type type;
|
||||
typedef typename internal::VecTraits<type>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
|
||||
|
||||
if (src0Stride == src1Stride && src0Stride == dstStride &&
|
||||
src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
|
||||
{
|
||||
size.width *= size.height;
|
||||
size.height = 1;
|
||||
}
|
||||
|
||||
const u32 step_base = 32 / sizeof(type);
|
||||
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
|
||||
|
||||
for (size_t y = 0; y < size.height; ++y)
|
||||
{
|
||||
const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
|
||||
const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
|
||||
size_t x = 0;
|
||||
|
||||
for( ; x < roiw_base; x += step_base )
|
||||
{
|
||||
internal::prefetch(src0 + x);
|
||||
internal::prefetch(src1 + x);
|
||||
|
||||
vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
|
||||
vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
|
||||
uvec128 v_dst0;
|
||||
uvec128 v_dst1;
|
||||
|
||||
op(v_src00, v_src10, v_dst0);
|
||||
op(v_src01, v_src11, v_dst1);
|
||||
|
||||
vnst(dst + x, v_dst0, v_dst1);
|
||||
}
|
||||
|
||||
vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
|
||||
|
||||
for (; x < size.width; ++x)
|
||||
{
|
||||
op(src0 + x, src1 + x, dst + x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
struct OpCmpEQ
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vceqq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vceq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = src0[0] == src1[0] ? 255 : 0;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct OpCmpNE
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = src0[0] == src1[0] ? 0 : 255;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct OpCmpGT
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vcgtq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vcgt(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = src0[0] > src1[0] ? 255 : 0;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct OpCmpGE
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vcgeq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vcge(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, u8 * dst) const
|
||||
{
|
||||
dst[0] = src0[0] >= src1[0] ? 255 : 0;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#define IMPL_CMPOP(op, type) \
|
||||
void cmp##op(const Size2D &size, \
|
||||
const type * src0Base, ptrdiff_t src0Stride, \
|
||||
const type * src1Base, ptrdiff_t src1Stride, \
|
||||
u8 *dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
vcompare(size, \
|
||||
src0Base, src0Stride, \
|
||||
src1Base, src1Stride, \
|
||||
dstBase, dstStride, \
|
||||
OpCmp##op<type>()); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define IMPL_CMPOP(op, type) \
|
||||
void cmp##op(const Size2D &size, \
|
||||
const type * src0Base, ptrdiff_t src0Stride, \
|
||||
const type * src1Base, ptrdiff_t src1Stride, \
|
||||
u8 *dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
(void)size; \
|
||||
(void)src0Base; \
|
||||
(void)src0Stride; \
|
||||
(void)src1Base; \
|
||||
(void)src1Stride; \
|
||||
(void)dstBase; \
|
||||
(void)dstStride; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
IMPL_CMPOP(EQ, u8)
|
||||
IMPL_CMPOP(EQ, s8)
|
||||
IMPL_CMPOP(EQ, u16)
|
||||
IMPL_CMPOP(EQ, s16)
|
||||
IMPL_CMPOP(EQ, u32)
|
||||
IMPL_CMPOP(EQ, s32)
|
||||
IMPL_CMPOP(EQ, f32)
|
||||
|
||||
IMPL_CMPOP(NE, u8)
|
||||
IMPL_CMPOP(NE, s8)
|
||||
IMPL_CMPOP(NE, u16)
|
||||
IMPL_CMPOP(NE, s16)
|
||||
IMPL_CMPOP(NE, u32)
|
||||
IMPL_CMPOP(NE, s32)
|
||||
IMPL_CMPOP(NE, f32)
|
||||
|
||||
IMPL_CMPOP(GT, u8)
|
||||
IMPL_CMPOP(GT, s8)
|
||||
IMPL_CMPOP(GT, u16)
|
||||
IMPL_CMPOP(GT, s16)
|
||||
IMPL_CMPOP(GT, u32)
|
||||
IMPL_CMPOP(GT, s32)
|
||||
IMPL_CMPOP(GT, f32)
|
||||
|
||||
IMPL_CMPOP(GE, u8)
|
||||
IMPL_CMPOP(GE, s8)
|
||||
IMPL_CMPOP(GE, u16)
|
||||
IMPL_CMPOP(GE, s16)
|
||||
IMPL_CMPOP(GE, u32)
|
||||
IMPL_CMPOP(GE, s32)
|
||||
IMPL_CMPOP(GE, f32)
|
||||
|
||||
} // namespace CAROTENE_NS
|
2846
3rdparty/carotene/src/colorconvert.cpp
vendored
Normal file
2846
3rdparty/carotene/src/colorconvert.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
108
3rdparty/carotene/src/common.cpp
vendored
Normal file
108
3rdparty/carotene/src/common.cpp
vendored
Normal file
@ -0,0 +1,108 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isSupportedConfiguration()
|
||||
{
|
||||
#ifdef CAROTENE_NEON
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
void assertSupportedConfiguration(bool parametersSupported)
|
||||
{
|
||||
if (!isSupportedConfiguration()) {
|
||||
std::cerr << "internal error: attempted to use an unavailable function" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
if (!parametersSupported) {
|
||||
std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
|
||||
{
|
||||
ptrdiff_t p = _p + (ptrdiff_t)startMargin;
|
||||
size_t len = _len + startMargin + endMargin;
|
||||
if( (size_t)p < len )
|
||||
return _p;
|
||||
else if( borderType == BORDER_MODE_REPLICATE )
|
||||
p = p < 0 ? 0 : (ptrdiff_t)len - 1;
|
||||
else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
|
||||
{
|
||||
s32 delta = borderType == BORDER_MODE_REFLECT101;
|
||||
if( len == 1 )
|
||||
return 0;
|
||||
do
|
||||
{
|
||||
if( p < 0 )
|
||||
p = -p - 1 + delta;
|
||||
else
|
||||
p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta;
|
||||
}
|
||||
while( (size_t)p >= len );
|
||||
}
|
||||
else if( borderType == BORDER_MODE_WRAP )
|
||||
{
|
||||
if( p < 0 )
|
||||
p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len;
|
||||
if( p >= (ptrdiff_t)len )
|
||||
p %= (ptrdiff_t)len;
|
||||
}
|
||||
else if( borderType == BORDER_MODE_CONSTANT )
|
||||
p = -1;
|
||||
else
|
||||
internal::assertSupportedConfiguration(false);
|
||||
return p - (ptrdiff_t)startMargin;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace CAROTENE_NS
|
97
3rdparty/carotene/src/common.hpp
vendored
Normal file
97
3rdparty/carotene/src/common.hpp
vendored
Normal file
@ -0,0 +1,97 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_SRC_COMMON_HPP
|
||||
#define CAROTENE_SRC_COMMON_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
|
||||
#define CAROTENE_NEON
|
||||
#endif
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
#include <arm_neon.h>
|
||||
#include "intrinsics.hpp"
|
||||
#endif
|
||||
|
||||
#include <carotene/functions.hpp>
|
||||
#include "saturate_cast.hpp"
|
||||
|
||||
namespace CAROTENE_NS { namespace internal {
|
||||
|
||||
inline void prefetch(const void *ptr, size_t offset = 32*10)
|
||||
{
|
||||
#if defined __GNUC__
|
||||
__builtin_prefetch(reinterpret_cast<const char*>(ptr) + offset);
|
||||
#elif defined _MSC_VER && defined CAROTENE_NEON
|
||||
__prefetch(reinterpret_cast<const char*>(ptr) + offset);
|
||||
#else
|
||||
(void)ptr;
|
||||
(void)offset;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
|
||||
{
|
||||
char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base));
|
||||
return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride);
|
||||
}
|
||||
|
||||
void assertSupportedConfiguration(bool parametersSupported = true);
|
||||
|
||||
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
|
||||
|
||||
/*!
|
||||
* Aligns pointer by the certain number of bytes
|
||||
*
|
||||
* This small inline function aligns the pointer by the certain number of bytes by shifting
|
||||
* it forward by 0 or a positive offset.
|
||||
*/
|
||||
template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
|
||||
{
|
||||
return (T*)(((size_t)ptr + n-1) & -n);
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
1331
3rdparty/carotene/src/convert.cpp
vendored
Normal file
1331
3rdparty/carotene/src/convert.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
399
3rdparty/carotene/src/convert_depth.cpp
vendored
Normal file
399
3rdparty/carotene/src/convert_depth.cpp
vendored
Normal file
@ -0,0 +1,399 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <int shift>
|
||||
void lshiftConst(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
s16 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
|
||||
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
|
||||
|
||||
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
|
||||
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
|
||||
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = ((s16)src[j] << shift);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void lshiftConst<0>(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
s16 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
|
||||
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
|
||||
|
||||
vst1q_s16(dst + j, v_dst0);
|
||||
vst1q_s16(dst + j + 8, v_dst1);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
|
||||
vst1q_s16(dst + j, v_dst);
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = (s16)src[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int shift>
|
||||
void rshiftConst(const Size2D &size,
|
||||
const s16 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY cpolicy)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
if (cpolicy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
|
||||
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
|
||||
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
|
||||
vqmovun_s16(v_src1));
|
||||
vst1q_u8(dst + j, v_dst);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
|
||||
vst1_u8(dst + j, vqmovun_s16(v_src));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
|
||||
}
|
||||
}
|
||||
else // CONVERT_POLICY_WRAP
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
|
||||
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
|
||||
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
|
||||
vmovn_s16(v_src1));
|
||||
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
|
||||
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = (u8)((src[j] >> shift));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void rshiftConst<0>(const Size2D &size,
|
||||
const s16 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY cpolicy)
|
||||
{
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
if (cpolicy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
|
||||
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
|
||||
vst1q_u8(dst + j, v_dst);
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vld1q_s16(src + j);
|
||||
vst1_u8(dst + j, vqmovun_s16(v_src));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = internal::saturate_cast<u8>(src[j]);
|
||||
}
|
||||
}
|
||||
else // CONVERT_POLICY_WRAP
|
||||
{
|
||||
for (; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
|
||||
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
|
||||
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
|
||||
}
|
||||
for (; j < roiw8; j += 8)
|
||||
{
|
||||
int16x8_t v_src = vld1q_s16(src + j);
|
||||
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
|
||||
}
|
||||
|
||||
for (; j < size.width; j++)
|
||||
{
|
||||
dst[j] = (u8)src[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pointer type matching a lshiftConst<N> instantiation; used to select the
// compile-time shift amount from a run-time value via a dispatch table.
typedef void (* lshiftConstFunc)(const Size2D &size,
                                 const u8 * srcBase, ptrdiff_t srcStride,
                                 s16 * dstBase, ptrdiff_t dstStride);

// Pointer type matching a rshiftConst<N> instantiation (same dispatch scheme,
// with an extra overflow-policy argument).
typedef void (* rshiftConstFunc)(const Size2D &size,
                                 const s16 * srcBase, ptrdiff_t srcStride,
                                 u8 * dstBase, ptrdiff_t dstStride,
                                 CONVERT_POLICY cpolicy);
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
// Element-wise left shift of an u8 image into an s16 image: dst = src << shift.
// shift >= 16 pushes every bit out of the 16-bit result, so the destination is
// simply zero-filled in that case.
void lshift(const Size2D &size,
            const u8 * srcBase, ptrdiff_t srcStride,
            s16 * dstBase, ptrdiff_t dstStride,
            u32 shift)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    if (shift >= 16u)
    {
        // All bits shifted out: result is all zeros.
        for (size_t i = 0; i < size.height; ++i)
        {
            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
            std::memset(dst, 0, sizeof(s16) * size.width);
        }
        return;
    }

    // This ugly construction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    //     return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
    // i.e. NEON shift intrinsics require a compile-time immediate, so every
    // possible shift amount is instantiated and picked by table lookup.
    lshiftConstFunc funcs[16] =
    {
        lshiftConst<0>,
        lshiftConst<1>,
        lshiftConst<2>,
        lshiftConst<3>,
        lshiftConst<4>,
        lshiftConst<5>,
        lshiftConst<6>,
        lshiftConst<7>,
        lshiftConst<8>,
        lshiftConst<9>,
        lshiftConst<10>,
        lshiftConst<11>,
        lshiftConst<12>,
        lshiftConst<13>,
        lshiftConst<14>,
        lshiftConst<15>
    }, func = funcs[shift];  // shift < 16 guaranteed by the early return above

    func(size, srcBase, srcStride, dstBase, dstStride);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
#endif
}
|
||||
|
||||
// Element-wise arithmetic right shift of an s16 image into an u8 image:
// dst = convert(src >> shift), where the conversion saturates or wraps
// according to cpolicy.
void rshift(const Size2D &size,
            const s16 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            u32 shift, CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    if (shift >= 16)
    {
        // Arithmetic shift by >= 15 reduces every value to 0 (non-negative)
        // or -1 (negative).
        if (cpolicy == CONVERT_POLICY_WRAP)
        {
            // Wrapped result: 0 for src >= 0, (u8)(-1) == 255 for src < 0.
            size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
            size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
            int16x8_t v_zero = vdupq_n_s16(0);

            for (size_t i = 0; i < size.height; ++i)
            {
                const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
                size_t j = 0;

                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src + j);
                    int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
                    // vcltq_s16 yields an all-ones (0xFFFF) lane where src < 0;
                    // narrowing gives the desired 0x00 / 0xFF bytes.
                    uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
                                                   vmovn_u16(vcltq_s16(v_src1, v_zero)));
                    vst1q_u8(dst + j, v_dst);
                }
                for (; j < roiw8; j += 8)
                {
                    int16x8_t v_src = vld1q_s16(src + j);
                    vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
                }

                // Scalar tail: mirrors the vector comparison above.
                for (; j < size.width; j++)
                {
                    dst[j] = src[j] >= 0 ? 0 : 255;
                }
            }
        }
        else
        {
            // Saturating result: -1 saturates to 0, and 0 stays 0,
            // so the whole destination is zero.
            for (size_t i = 0; i < size.height; ++i)
            {
                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
                std::memset(dst, 0, sizeof(u8) * size.width);
            }
        }
        return;
    }

    // This ugly construction is needed to avoid:
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
    //     return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
    // i.e. NEON shift intrinsics require a compile-time immediate, so every
    // possible shift amount is instantiated and picked by table lookup.
    rshiftConstFunc funcs[16] =
    {
        rshiftConst<0>,
        rshiftConst<1>,
        rshiftConst<2>,
        rshiftConst<3>,
        rshiftConst<4>,
        rshiftConst<5>,
        rshiftConst<6>,
        rshiftConst<7>,
        rshiftConst<8>,
        rshiftConst<9>,
        rshiftConst<10>,
        rshiftConst<11>,
        rshiftConst<12>,
        rshiftConst<13>,
        rshiftConst<14>,
        rshiftConst<15>
    }, func = funcs[shift];  // shift < 16 guaranteed by the early return above

    func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)shift;
    (void)cpolicy;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
2498
3rdparty/carotene/src/convert_scale.cpp
vendored
Normal file
2498
3rdparty/carotene/src/convert_scale.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
340
3rdparty/carotene/src/convolution.cpp
vendored
Normal file
340
3rdparty/carotene/src/convolution.cpp
vendored
Normal file
@ -0,0 +1,340 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
|
||||
BORDER_MODE border)
|
||||
{
|
||||
return isSupportedConfiguration() && size.width >= 8 &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REPLICATE) &&
|
||||
(ksize.width == 3) && (ksize.height == 3);
|
||||
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {

// Wrapper around vshrq_n_s32 so that the (compile-time) shift amount can be
// selected at run time through a table of instantiations.
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, shift);
}

// vshrq_n_s32 requires an immediate in [1, 32], so shift == 0 must be a
// plain pass-through.
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

// Pointer type matching a vshrq_s32<N> instantiation.
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
|
||||
|
||||
#endif
|
||||
|
||||
// 3x3 convolution of an u8 image with an s16 kernel, followed by an
// arithmetic right shift by `scale` and saturation back to u8.
// Border handling is CONSTANT (borderValue) or REPLICATE, as validated by
// isConvolutionSupported().
//
// Strategy: for each output row, three 8-pixel vectors per source row
// (previous / current / next vector) slide left-to-right; neighbor columns
// are formed with vext, widened to s16, and multiply-accumulated into two
// s32 accumulators (low/high halves). A scalar loop finishes the row tail.
// NOTE(review): kernelBase appears to be indexed in reverse (kernelBase[8]
// pairs with the top-left neighbor), i.e. true convolution rather than
// correlation — confirm against callers.
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    // Sliding window of three 8-pixel vectors per source row
    // (index 0/1/2 = row above / current row / row below).
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    // Run-time scale -> compile-time immediate dispatch (NEON shift
    // intrinsics require a constant shift amount).
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // srow0/srow2 are NULL when the row is outside the image and the
        // border mode is CONSTANT; loads then substitute v_border.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar-tail state: left / center / right pixel of each source row.
        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // On the last two rows, stop the vector loop 8 pixels early so the
        // next-row load never reads past the image.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift: advance the 3-vector sliding window
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements: on the first
            // iteration only the left-border vector is synthesized; output
            // for these pixels is produced one iteration later.
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    // Replicate the first pixel of each row across the vector.
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }

                continue;
            }

            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            {
                // combine 3 "shifted" vectors: left / center / right
                // neighbors for the row above, then widen to s16 and
                // multiply-accumulate with the matching kernel taps.
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            {
                // combine 3 "shifted" vectors: current row, kernel taps 5..3.
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            {
                // combine 3 "shifted" vectors: row below, kernel taps 2..0.
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }


            // make scale: arithmetic right shift by `scale`
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // and add them: saturate s32 -> u16 -> u8 and store the 8 pixels
            // computed in the PREVIOUS window position (hence x - 8).
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // Rewind to the first pixel not produced by the vector loop.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            // Accumulate the 3x3 window with reversed kernel indexing,
            // matching the vector path above.
            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scale);

            // make shift: slide the scalar 3-pixel window
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
430
3rdparty/carotene/src/count_nonzero.cpp
vendored
Normal file
430
3rdparty/carotene/src/count_nonzero.cpp
vendored
Normal file
@ -0,0 +1,430 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
// Counts non-zero u8 elements, saturating the result at INT32_MAX.
// Vector strategy: clamp each byte to 0/1 with vminq_u8 and accumulate in a
// u8 vector; a block size of 16*255 guarantees the per-lane byte accumulator
// cannot overflow before it is widened and folded into `result`.
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // A contiguous buffer can be treated as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw16 = size.width & ~15u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
        uint8x16_t vc1 = vmovq_n_u8(1);
        for (; i < roiw16;)
        {
            // Process at most BLOCK_SIZE elements before spilling the byte
            // accumulator, so no lane can exceed 255.
            size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
            uint8x16_t vs = vmovq_n_u8(0);

            for (; i <= lim; i+= 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                uint8x16_t vnz = vminq_u8(vln, vc1);  // 0 stays 0, non-zero -> 1
                vs = vaddq_u8(vs, vnz);
            }

            // Horizontal fold: u8 lanes -> u16 -> u32 pairwise sums.
            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));

            s32 s[2];
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // Scalar tail for the remaining (< 16) elements.
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
|
||||
|
||||
// Counts non-zero u16 elements, saturating the result at INT32_MAX.
// Same block-accumulation scheme as the u8 overload, but the per-lane
// accumulator is u16, allowing 8*(65536-1) elements per block.
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // A contiguous buffer can be treated as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u16* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
        uint16x8_t vc1 = vmovq_n_u16(1);
        for (; i < roiw8;)
        {
            // Cap each block so the u16 lane accumulator cannot overflow.
            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
            uint16x8_t vs = vmovq_n_u16(0);

            for (; i <= lim; i+= 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                uint16x8_t vnz = vminq_u16(vln, vc1);  // 0 stays 0, non-zero -> 1
                vs = vaddq_u16(vs, vnz);
            }

            // Horizontal fold: u16 lanes -> u32 pairwise sums.
            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));

            s32 s[2];
            vst1_u32((u32*)s, vs2);

            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
            {
                return 0x7fFFffFF;
            }
            result += (s[0] += s[1]);
            if (s[0] < 0 || result < 0)
            {
                return 0x7fFFffFF;
            }
        }
        // Scalar tail for the remaining (< 8) elements.
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
|
||||
|
||||
// Counts non-zero s32 elements, saturating the result at INT32_MAX.
// Values are viewed as u32 (zero-ness is sign-independent), clamped to 0/1
// with vminq_u32 and accumulated with saturating adds; the row total is
// folded and overflow-checked through a signed reinterpretation.
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // A contiguous buffer can be treated as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
        // Fix: index must be size_t, matching roiw4/size.width and the other
        // countNonZero overloads; after the row-flattening above the width
        // can exceed the range of the previous u32 index.
        size_t i = 0;

        uint32x4_t vc1 = vmovq_n_u32(1);
        uint32x4_t vs = vmovq_n_u32(0);

        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            uint32x4_t vnz = vminq_u32(vln, vc1);  // 0 stays 0, non-zero -> 1
            vs = vqaddq_u32(vs, vnz);              // saturating accumulate
        }

        // Horizontal fold of the four lane counts.
        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));

        s32 s[2];
        vst1_u32((u32*)s, vs2);

        if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)
        {
            return 0x7fFFffFF;
        }

        // Scalar tail for the remaining (< 4) elements.
        for (; i < size.width; i++)
            result += (src[i] != 0)?1:0;
        if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
|
||||
|
||||
// Counts non-zero f32 elements, saturating the result at INT32_MAX.
// vceqq_f32 against 0 gives all-ones lanes for zeros; inverting yields
// 0xFFFFFFFF (== -1 as s32) per NON-zero lane, so the accumulator collects
// negated counts which vqneg turns positive.
// NOTE(review): the scalar tail treats subnormals (|x| < FLT_MIN) as zero,
// while the vector compare treats them per IEEE equality — the two paths can
// disagree on subnormal inputs; confirm intended semantics.
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // A contiguous buffer can be treated as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw4 = size.width & ~3u;
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f32* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        float32x4_t vc0 = vmovq_n_f32(0);
        int32x4_t vs = vmovq_n_s32(0);

        for (; i < roiw4; i += 4 )
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            // -1 per non-zero lane, 0 per zero lane.
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            vs = vqaddq_s32(vs, vnz);
        }

        // Fold the (negative) lane counts and negate to a positive count.
        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));

        int s[2];
        vst1_s32(s, vs2);

        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
        {
            return 0x7fFFffFF;
        }

        // Scalar tail: counts values outside (-FLT_MIN, FLT_MIN).
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;

        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
|
||||
|
||||
// Counts non-zero f64 elements, saturating the result at INT32_MAX.
// There is no NEON f64 compare here, so each double is masked with
// 0x7fFF...FF to clear the sign bit (±0.0 both become all-zero bit patterns;
// denormals keep non-zero bits and thus count as non-zero), then the two
// 32-bit halves are compared to zero and reduced with vpmax, yielding
// 0xFFFFFFFF (-1) per non-zero double. Four independent accumulators unroll
// the loop by 8 elements.
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // A contiguous buffer can be treated as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    size_t roiw8 = size.width & ~7u;
    size_t roiw4 = size.width & ~3u;
    size_t roiw2 = size.width & ~1u;
    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
    uint32x4_t vc0 = vmovq_n_u32(0);

    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f64* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;

        // Four accumulators of negated counts (one per unrolled pair).
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);

        // Main loop: 8 doubles per iteration.
        for (; i < roiw8; i += 8 )
        {
            internal::prefetch(src + i + 6);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));

            // Clear sign bits so -0.0 compares as zero.
            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
            uint64x2_t vm4 = vandq_u64(vln4, vmask1);

            // Per-32-bit-half "== 0" masks.
            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);

            // Invert: non-zero halves become all-ones.
            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);
            uint32x4_t vlx3 = vmvnq_u32(vequ3);
            uint32x4_t vlx4 = vmvnq_u32(vequ4);

            // vpmax merges the two halves of each double: -1 if either
            // half of the masked value is non-zero.
            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            vs3 = vqadd_s32(vs3, vnz3);
            vs4 = vqadd_s32(vs4, vnz4);
        }

        // At most one 4-element step remains (roiw4 - roiw8 is 0 or 4).
        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
            uint64x2_t vm2 = vandq_u64(vln2, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);
            uint32x4_t vlx2 = vmvnq_u32(vequ2);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));

            vs1 = vqadd_s32(vs1, vnz1);
            vs2 = vqadd_s32(vs2, vnz2);
            i += 4;
        }

        // At most one 2-element step remains (roiw2 - roiw4 is 0 or 2).
        if (i < roiw2)
        {
            internal::prefetch(src + i);
            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));

            uint64x2_t vm1 = vandq_u64(vln1, vmask1);

            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);

            uint32x4_t vlx1 = vmvnq_u32(vequ1);

            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));

            vs1 = vqadd_s32(vs1, vnz1);
            i += 2;
        }

        // Fold the four accumulators and negate to a positive count.
        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        int32x2_t vsneg = vqneg_s32(vs1);

        s32 s[2];
        vst1_s32(s, vsneg);

        result += (s[0] += s[1]);
        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
        {
            return 0x7fFFffFF;
        }

        // Scalar tail: counts values outside (-DBL_MIN, DBL_MIN).
        for (; i < size.width; i++)
            result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
        if (result < 0)
        {
            return 0x7fFFffFF;
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
708
3rdparty/carotene/src/div.cpp
vendored
Normal file
708
3rdparty/carotene/src/div.cpp
vendored
Normal file
@ -0,0 +1,708 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
#include <cstring>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
namespace {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
// Prepare each lane of v for round-half-away-from-zero: add +/-0.5 carrying
// the sign of the lane.  The caller's subsequent truncating convert
// (vcvtq_*_f32) completes the rounding.
inline float32x4_t vroundq(const float32x4_t& v)
{
    const int32x4_t sign_bit  = vdupq_n_s32(1 << 31);
    const int32x4_t half_bits = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    // OR the sign of v into the bit pattern of 0.5f -> signed half
    const int32x4_t signed_half = vorrq_s32(half_bits, vandq_s32(sign_bit, vreinterpretq_s32_f32(v)));
    return vaddq_f32(v, vreinterpretq_f32_s32(signed_half));
}
|
||||
// Lane-wise saturated (v1 * scale) / v2 on a full 128-bit register.
// The generic template widens each half (vmovl), recurses, and narrows
// back with saturation (vqmovn); recursion bottoms out at the 32-bit
// specializations below.
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: compute in f32 as (v1*scale) * (1/v2), round
// half-away-from-zero via vroundq, then convert back (truncating).
// NOTE(review): internal::vrecpq_f32 presumably refines vrecpeq_f32
// (Newton-Raphson) — confirm its precision is sufficient for exact
// integer quotients.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
|
||||
// 64-bit-register sibling of vroundq: add +/-0.5 with the sign of each
// lane so that the caller's truncating convert rounds half away from zero.
inline float32x2_t vround(const float32x2_t& v)
{
    const int32x2_t sign_bit  = vdup_n_s32(1 << 31);
    const int32x2_t half_bits = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    // OR the sign of v into the bit pattern of 0.5f -> signed half
    const int32x2_t signed_half = vorr_s32(half_bits, vand_s32(sign_bit, vreinterpret_s32_f32(v)));
    return vadd_f32(v, vreinterpret_f32_s32(signed_half));
}
|
||||
// 64-bit-register variant of divSaturateQ: widen to 128 bits, divide
// there, and narrow back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases mirror the Q versions, using the 64-bit intrinsics.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
||||
|
||||
|
||||
// Lane-wise wrapping (v1 * scale) / v2 on a full 128-bit register.
// Same widen/recurse/narrow structure as divSaturateQ, but narrows with
// plain vmovn (result wraps modulo the lane width) and the 32-bit base
// cases convert without the vroundq half-adjust, i.e. truncate toward zero.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
||||
|
||||
// 64-bit-register variant of divWrapQ: widen, divide, narrow (wrapping).
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: truncating convert, no rounding adjustment.
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
||||
|
||||
// Type-dispatching wrappers around the NEON VTST ("test bits") intrinsics.
// VTST sets all bits of a result lane where (a & b) != 0, so vtstq(x, x)
// yields an all-ones mask in nonzero lanes and zero in zero lanes.
// The div/recip kernels use this to force the output to 0 where the
// divisor is 0.  Signed variants reinterpret the unsigned mask back to
// the signed lane type so the mask matches the operand type.
inline uint8x16_t vtstq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t  vtstq(const int8x16_t   & v0, const int8x16_t   & v1) { return vreinterpretq_s8_u8  (vtstq_s8 (v0, v1)); }
inline int16x8_t  vtstq(const int16x8_t   & v0, const int16x8_t   & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t  vtstq(const int32x4_t   & v0, const int32x4_t   & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }

// 64-bit-register counterparts of the vtstq overloads above.
inline uint8x8_t  vtst(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t   vtst(const int8x8_t   & v0, const int8x8_t   & v1) { return vreinterpret_s8_u8  (vtst_s8 (v0, v1)); }
inline int16x4_t  vtst(const int16x4_t  & v0, const int16x4_t  & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t  vtst(const int32x2_t  & v0, const int32x2_t  & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
||||
#endif
|
||||
|
||||
// Element-wise division: dst = convert(scale * src0 / src1), where
// convert saturates (CONVERT_POLICY_SATURATE) or truncates/wraps
// (CONVERT_POLICY_WRAP).  Lanes with src1 == 0 are forced to 0 instead
// of trapping.  Rows are processed with 128-bit vectors, then 64-bit
// vectors, then a scalar tail.
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Short-circuit: if scale is 0, or T is integral and even the largest
    // representable value scaled down has magnitude < 1, every output
    // rounds to 0 — just zero-fill the destination row by row.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) < 1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // roiwN is one past the last column index from which a full N-bit
    // vector load still fits inside the row (j < roiwN => j+stepN <= width).
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                // vtstq(x, x): all-ones mask in nonzero lanes; AND with the
                // quotient forces dst to 0 wherever the divisor is 0.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // Scalar tail: exact division with saturating cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap via integer cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
// Lane-wise saturated scale / v2 on a full 128-bit register.  Same
// widen/recurse/narrow-with-saturation structure as divSaturateQ, with
// the numerator folded into the scale.
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: (1/v2) * scale in f32, truncating convert back.
// NOTE(review): unlike divSaturateQ, no vroundq half-adjust is applied
// here, so results truncate toward zero — confirm this is intended.
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit-register variant: widen, compute, narrow with saturation.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
||||
|
||||
|
||||
// Lane-wise wrapping scale / v2 on a full 128-bit register.  Narrows
// with plain vmovn (wrap modulo lane width) instead of vqmovn.
// The 32-bit base cases are identical to recipSaturateQ's, since
// saturation vs. wrap only differs during narrowing.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit-register variant: widen, compute, narrow (wrapping).
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
||||
#endif
|
||||
|
||||
// Element-wise reciprocal: dst = convert(scale / src1), saturating or
// wrapping per cpolicy; lanes with src1 == 0 produce 0.  Same loop
// skeleton as div<T> but with a single source image.
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Short-circuit: for integral T, |scale| < 1 means scale / src1 has
    // magnitude < 1 for every nonzero src1, so every output rounds to 0.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale < 1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // roiwN: see div<T> — bound below which a full N-bit load fits.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                // Nonzero-lane mask zeroes out divide-by-zero results.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            // Scalar tail with saturating cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, wrap via integer cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
|
||||
|
||||
}
|
||||
|
||||
// Public u8 entry point: forwards to the generic div<T> kernel.
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s8 entry point: forwards to the generic div<T> kernel.
void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public u16 entry point: forwards to the generic div<T> kernel.
void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s16 entry point: forwards to the generic div<T> kernel.
void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s32 entry point: forwards to the generic div<T> kernel.
void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Float division: dst = src0 * scale / src1, with 0 written where
// src1 == 0.  No conversion policy — f32 output neither saturates nor
// wraps.  A separate fast path skips the scale multiply when scale ~ 1.
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every output 0 regardless of the inputs.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    // Bounds below which a full 4-lane / 2-lane load fits in the row.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // scale ~= 1: compute src0 / src1 directly (saves one multiply).
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                // vbic clears result bits where the divisor-is-zero mask is
                // set, forcing those lanes to +0.0f.
                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
            }

            // Scalar tail.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
            }
        }
    }
    else
    {
        // General case: fold the scale into the numerator first.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
                                                    internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
                                                  internal::vrecp_f32(v_src1))), v_mask)));
            }

            // Scalar tail.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
            }
        }
    }
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
|
||||
|
||||
// Public u8 entry point: forwards to the generic recip<T> kernel.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s8 entry point: forwards to the generic recip<T> kernel.
void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public u16 entry point: forwards to the generic recip<T> kernel.
void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s16 entry point: forwards to the generic recip<T> kernel.
void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Public s32 entry point: forwards to the generic recip<T> kernel.
void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
|
||||
|
||||
// Float reciprocal: dst = scale / src, with 0 written where src == 0.
// Mirrors div(f32): a fast path drops the scale multiply when scale ~ 1.
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes every output 0 regardless of the input.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    // Bounds below which a full 4-lane / 2-lane load fits in the row.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // scale ~= 1: plain reciprocal, no multiply.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                // vbic with the zero-compare mask forces 0 where src == 0.
                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
            }

            // Scalar tail.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? 1.0f / src1[j] : 0;
            }
        }
    }
    else
    {
        // General case: scale applied to the reciprocal.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
                                                      scale)),v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
                                                    scale)), v_mask)));
            }

            // Scalar tail.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? scale / src1[j] : 0;
            }
        }
    }
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
260
3rdparty/carotene/src/dot_product.cpp
vendored
Normal file
260
3rdparty/carotene/src/dot_product.cpp
vendored
Normal file
@ -0,0 +1,260 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
// Dot product of two u8 images: sum over all pixels of src0[i]*src1[i],
// returned as f64.  Products are accumulated in u32 lanes inside bounded
// blocks (to avoid overflow), folded into a u64 pair, and finally into
// the f64 result.
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Dense images (stride == row width in bytes) collapse to one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
    // We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        uint64x2_t ws = vmovq_n_u64(0);

        while(i + 16 <= size.width)
        {
            // End of the current overflow-safe block (still leaving room
            // for one full 16-element step).
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                // Widening multiply u8*u8 -> u16, then pairwise
                // add-accumulate into the u32 lanes.
                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            // Fold the block's u32 partial sums into the u64 accumulator.
            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        // One 8-element step if at least 8 pixels remain.
        if(i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        // Scalar tail.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
|
||||
|
||||
// Dot product of two s8 images: sum over all pixels of src0[i]*src1[i],
// returned as f64.  Products are accumulated in s32 lanes inside bounded
// blocks (to avoid overflow), folded into an s64 pair, then into f64.
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Dense images (stride == row width in bytes) collapse to one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
    // We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        while(i + 16 <= size.width)
        {
            // Fix: use the signed block size defined above.  The original
            // used DOT_UINT_BLOCKSIZE (the u8 limit, 66050*8), needlessly
            // halving the overflow-safe block documented in the comment.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                // Widening multiply s8*s8 -> s16, then pairwise
                // add-accumulate into the s32 lanes.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Fold the block's s32 partial sums into the s64 accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // One 8-element step if at least 8 pixels remain.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail.
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
|
||||
|
||||
// Dot product of two f32 images, accumulated in f64.  Partial sums are
// kept in a float32x4 accumulator within bounded blocks (DOT_FLOAT_BLOCKSIZE
// elements) and folded into the f64 result between blocks to limit f32
// precision loss.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Dense images (stride == row width in bytes) collapse to one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while(i + 4 <= size.width)
        {
            // End of the current block, leaving room for a full 4-lane step.
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            // Fused multiply-accumulate over the block.
            for( ; i <= lim; i += 4 )
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            // Horizontal reduction of the 4 lanes into the f64 result.
            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        // One 2-element step if at least 2 values remain.
        if(i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        // Scalar tail.
        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    // Non-NEON build: silence unused-parameter warnings.
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
428
3rdparty/carotene/src/fast.cpp
vendored
Normal file
428
3rdparty/carotene/src/fast.cpp
vendored
Normal file
@ -0,0 +1,428 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
|
||||
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
|
||||
Below is the original copyright and the references */
|
||||
|
||||
/*
|
||||
Copyright (c) 2006, 2008 Edward Rosten
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
*Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
*Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
*Neither the name of the University of Cambridge nor the names of
|
||||
its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
The references are:
|
||||
* Machine learning for high-speed corner detection,
|
||||
E. Rosten and T. Drummond, ECCV 2006
|
||||
* Faster and better: A machine learning approach to corner detection
|
||||
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
namespace
|
||||
{
|
||||
|
||||
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
|
||||
{
|
||||
pixel[0] = 0 + row_stride * 3;
|
||||
pixel[1] = 1 + row_stride * 3;
|
||||
pixel[2] = 2 + row_stride * 2;
|
||||
pixel[3] = 3 + row_stride * 1;
|
||||
pixel[4] = 3 + row_stride * 0;
|
||||
pixel[5] = 3 + row_stride * -1;
|
||||
pixel[6] = 2 + row_stride * -2;
|
||||
pixel[7] = 1 + row_stride * -3;
|
||||
pixel[8] = 0 + row_stride * -3;
|
||||
pixel[9] = -1 + row_stride * -3;
|
||||
pixel[10] = -2 + row_stride * -2;
|
||||
pixel[11] = -3 + row_stride * -1;
|
||||
pixel[12] = -3 + row_stride * 0;
|
||||
pixel[13] = -3 + row_stride * 1;
|
||||
pixel[14] = -2 + row_stride * 2;
|
||||
pixel[15] = -1 + row_stride * 3;
|
||||
}
|
||||
|
||||
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
|
||||
{
|
||||
const s32 K = 8, N = 16 + K + 1;
|
||||
s32 k, v = ptr[0];
|
||||
s16 d[(N + 7) & ~7];
|
||||
for( k = 0; k < N; k++ )
|
||||
d[k] = (s16)(v - ptr[pixel[k]]);
|
||||
|
||||
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
|
||||
int16x8_t q1 = vdupq_n_s16((s16)(1000));
|
||||
|
||||
int16x8_t d0_7 = vld1q_s16(d + 0);
|
||||
int16x8_t d8_15 = vld1q_s16(d + 8);
|
||||
int16x8_t d16_23 = vld1q_s16(d + 16);
|
||||
int16x8_t d24 = vld1q_s16(d + 24);
|
||||
|
||||
//k == 0
|
||||
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
|
||||
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
|
||||
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
|
||||
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
|
||||
|
||||
v0k0 = vextq_s16(d0_7, d8_15, 3);
|
||||
ak0 = vminq_s16(ak0, v0k0);
|
||||
bk0 = vmaxq_s16(bk0, v0k0);
|
||||
|
||||
v1k0 = vextq_s16(d0_7, d8_15, 4);
|
||||
ak0 = vminq_s16(ak0, v1k0);
|
||||
bk0 = vmaxq_s16(bk0, v1k0);
|
||||
|
||||
v0k0 = vextq_s16(d0_7, d8_15, 5);
|
||||
ak0 = vminq_s16(ak0, v0k0);
|
||||
bk0 = vmaxq_s16(bk0, v0k0);
|
||||
|
||||
v1k0 = vextq_s16(d0_7, d8_15, 6);
|
||||
ak0 = vminq_s16(ak0, v1k0);
|
||||
bk0 = vmaxq_s16(bk0, v1k0);
|
||||
|
||||
v0k0 = vextq_s16(d0_7, d8_15, 7);
|
||||
ak0 = vminq_s16(ak0, v0k0);
|
||||
bk0 = vmaxq_s16(bk0, v0k0);
|
||||
|
||||
ak0 = vminq_s16(ak0, d8_15);
|
||||
bk0 = vmaxq_s16(bk0, d8_15);
|
||||
|
||||
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
|
||||
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
|
||||
|
||||
v1k0 = vextq_s16(d8_15, d16_23, 1);
|
||||
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
|
||||
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
|
||||
|
||||
//k == 8
|
||||
int16x8_t v0k8 = v1k0;
|
||||
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
|
||||
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
|
||||
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
|
||||
|
||||
v0k8 = vextq_s16(d8_15, d16_23, 3);
|
||||
ak8 = vminq_s16(ak8, v0k8);
|
||||
bk8 = vmaxq_s16(bk8, v0k8);
|
||||
|
||||
v1k8 = vextq_s16(d8_15, d16_23, 4);
|
||||
ak8 = vminq_s16(ak8, v1k8);
|
||||
bk8 = vmaxq_s16(bk8, v1k8);
|
||||
|
||||
v0k8 = vextq_s16(d8_15, d16_23, 5);
|
||||
ak8 = vminq_s16(ak8, v0k8);
|
||||
bk8 = vmaxq_s16(bk8, v0k8);
|
||||
|
||||
v1k8 = vextq_s16(d8_15, d16_23, 6);
|
||||
ak8 = vminq_s16(ak8, v1k8);
|
||||
bk8 = vmaxq_s16(bk8, v1k8);
|
||||
|
||||
v0k8 = vextq_s16(d8_15, d16_23, 7);
|
||||
ak8 = vminq_s16(ak8, v0k8);
|
||||
bk8 = vmaxq_s16(bk8, v0k8);
|
||||
|
||||
ak8 = vminq_s16(ak8, d16_23);
|
||||
bk8 = vmaxq_s16(bk8, d16_23);
|
||||
|
||||
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
|
||||
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
|
||||
|
||||
v1k8 = vextq_s16(d16_23, d24, 1);
|
||||
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
|
||||
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
|
||||
|
||||
//fin
|
||||
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
|
||||
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
|
||||
int32x4_t q2w = vmovl_s16(q2);
|
||||
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
|
||||
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
|
||||
|
||||
return (u8)(vget_lane_s32(q8, 0) - 1);
|
||||
}
|
||||
|
||||
} //namespace
|
||||
#endif
|
||||
|
||||
void FAST(const Size2D &size,
|
||||
u8 *srcBase, ptrdiff_t srcStride,
|
||||
KeypointStore *keypoints,
|
||||
u8 threshold, bool nonmax_suppression)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
//keypoints.clear();
|
||||
|
||||
const s32 K = 8, N = 16 + K + 1;
|
||||
ptrdiff_t i, j, k, pixel[N];
|
||||
makeOffsets(pixel, srcStride);
|
||||
for(k = 16; k < N; k++)
|
||||
pixel[k] = pixel[k - 16];
|
||||
|
||||
uint8x16_t delta = vdupq_n_u8(128);
|
||||
uint8x16_t t = vdupq_n_u8(threshold);
|
||||
uint8x16_t K16 = vdupq_n_u8((u8)K);
|
||||
|
||||
u8 threshold_tab[512];
|
||||
for( i = -255; i <= 255; i++ )
|
||||
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
|
||||
|
||||
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
|
||||
u8* buf[3];
|
||||
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
|
||||
ptrdiff_t* cpbuf[3];
|
||||
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
|
||||
cpbuf[1] = cpbuf[0] + size.width + 1;
|
||||
cpbuf[2] = cpbuf[1] + size.width + 1;
|
||||
memset(buf[0], 0, size.width*3);
|
||||
|
||||
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
|
||||
{
|
||||
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
|
||||
u8* curr = buf[(i - 3)%3];
|
||||
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
|
||||
memset(curr, 0, size.width);
|
||||
ptrdiff_t ncorners = 0;
|
||||
|
||||
if( i < (ptrdiff_t)size.height - 3 )
|
||||
{
|
||||
j = 3;
|
||||
|
||||
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
|
||||
{
|
||||
internal::prefetch(ptr);
|
||||
internal::prefetch(ptr + pixel[0]);
|
||||
internal::prefetch(ptr + pixel[2]);
|
||||
|
||||
uint8x16_t v0 = vld1q_u8(ptr);
|
||||
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
|
||||
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
|
||||
|
||||
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
|
||||
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
|
||||
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
|
||||
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
|
||||
|
||||
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
|
||||
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
|
||||
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
|
||||
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
|
||||
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
|
||||
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
|
||||
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
|
||||
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
|
||||
m0 = vorrq_u8(m0, m1);
|
||||
|
||||
u64 mask[2];
|
||||
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
|
||||
|
||||
if( mask[0] == 0 )
|
||||
{
|
||||
if (mask[1] != 0)
|
||||
{
|
||||
j -= 8;
|
||||
ptr -= 8;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
uint8x16_t c0 = vmovq_n_u8(0);
|
||||
uint8x16_t c1 = vmovq_n_u8(0);
|
||||
uint8x16_t max0 = vmovq_n_u8(0);
|
||||
uint8x16_t max1 = vmovq_n_u8(0);
|
||||
for( k = 0; k < N; k++ )
|
||||
{
|
||||
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
|
||||
m0 = vcgtq_s8(x, v2);
|
||||
m1 = vcgtq_s8(v1, x);
|
||||
|
||||
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
|
||||
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
|
||||
|
||||
max0 = vmaxq_u8(max0, c0);
|
||||
max1 = vmaxq_u8(max1, c1);
|
||||
}
|
||||
|
||||
max0 = vmaxq_u8(max0, max1);
|
||||
u8 m[16];
|
||||
vst1q_u8(m, vcgtq_u8(max0, K16));
|
||||
|
||||
for( k = 0; k < 16; ++k )
|
||||
if(m[k])
|
||||
{
|
||||
cornerpos[ncorners++] = j+k;
|
||||
if(nonmax_suppression)
|
||||
curr[j+k] = cornerScore(ptr+k, pixel);
|
||||
}
|
||||
}
|
||||
|
||||
for( ; j < (s32)size.width - 3; j++, ptr++ )
|
||||
{
|
||||
s32 v = ptr[0];
|
||||
const u8* tab = &threshold_tab[0] - v + 255;
|
||||
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
|
||||
|
||||
if( d == 0 )
|
||||
continue;
|
||||
|
||||
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
|
||||
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
|
||||
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
|
||||
|
||||
if( d == 0 )
|
||||
continue;
|
||||
|
||||
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
|
||||
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
|
||||
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
|
||||
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
|
||||
|
||||
if( d & 1 )
|
||||
{
|
||||
s32 vt = v - threshold, count = 0;
|
||||
|
||||
for( k = 0; k < N; k++ )
|
||||
{
|
||||
s32 x = ptr[pixel[k]];
|
||||
if(x < vt)
|
||||
{
|
||||
if( ++count > K )
|
||||
{
|
||||
cornerpos[ncorners++] = j;
|
||||
if(nonmax_suppression)
|
||||
curr[j] = cornerScore(ptr, pixel);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( d & 2 )
|
||||
{
|
||||
s32 vt = v + threshold, count = 0;
|
||||
|
||||
for( k = 0; k < N; k++ )
|
||||
{
|
||||
s32 x = ptr[pixel[k]];
|
||||
if(x > vt)
|
||||
{
|
||||
if( ++count > K )
|
||||
{
|
||||
cornerpos[ncorners++] = j;
|
||||
if(nonmax_suppression)
|
||||
curr[j] = cornerScore(ptr, pixel);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cornerpos[-1] = ncorners;
|
||||
|
||||
if( i == 3 )
|
||||
continue;
|
||||
|
||||
const u8* prev = buf[(i - 4 + 3)%3];
|
||||
const u8* pprev = buf[(i - 5 + 3)%3];
|
||||
cornerpos = cpbuf[(i - 4 + 3)%3];
|
||||
ncorners = cornerpos[-1];
|
||||
|
||||
for( k = 0; k < ncorners; k++ )
|
||||
{
|
||||
j = cornerpos[k];
|
||||
s32 score = prev[j];
|
||||
if( !nonmax_suppression ||
|
||||
(score > prev[j+1] && score > prev[j-1] &&
|
||||
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
|
||||
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
|
||||
{
|
||||
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)keypoints;
|
||||
(void)threshold;
|
||||
(void)nonmax_suppression;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
442
3rdparty/carotene/src/fill_minmaxloc.cpp
vendored
Normal file
442
3rdparty/carotene/src/fill_minmaxloc.cpp
vendored
Normal file
@ -0,0 +1,442 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
void process(const T * src, size_t j0, size_t j1, size_t i,
|
||||
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
for (size_t j = j0; j < j1; ++j)
|
||||
{
|
||||
T val = src[j];
|
||||
|
||||
if (val == maxVal)
|
||||
{
|
||||
if (maxLocCount < maxLocCapacity)
|
||||
{
|
||||
maxLocPtr[maxLocCount] = j;
|
||||
maxLocPtr[maxLocCount + 1] = i;
|
||||
}
|
||||
maxLocCount += 2;
|
||||
}
|
||||
|
||||
if (val == minVal)
|
||||
{
|
||||
if (minLocCount < minLocCapacity)
|
||||
{
|
||||
minLocPtr[minLocCount] = j;
|
||||
minLocPtr[minLocCount + 1] = i;
|
||||
}
|
||||
minLocCount += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void fillMinMaxLocs(const Size2D & size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
|
||||
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
|
||||
|
||||
u64 mask[2] = { 0ul };
|
||||
|
||||
minLocCapacity <<= 1;
|
||||
maxLocCapacity <<= 1;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for ( ; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint8x16_t v_src = vld1q_u8(src + j);
|
||||
|
||||
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
|
||||
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
|
||||
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
|
||||
|
||||
vst1q_u8((u8 *)&mask[0], v_mask);
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
if (mask[1])
|
||||
process(src, j + 8, j + 16, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
for ( ; j < roiw8; j += 8)
|
||||
{
|
||||
uint8x8_t v_src = vld1_u8(src + j);
|
||||
|
||||
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
|
||||
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
|
||||
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
|
||||
|
||||
vst1_u8((u8 *)&mask[0], v_mask);
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
process(src, j, size.width, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
minLocCount >>= 1;
|
||||
maxLocCount >>= 1;
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)minVal;
|
||||
(void)minLocPtr;
|
||||
(void)minLocCount;
|
||||
(void)minLocCapacity;
|
||||
(void)maxVal;
|
||||
(void)maxLocPtr;
|
||||
(void)maxLocCount;
|
||||
(void)maxLocCapacity;
|
||||
#endif
|
||||
}
|
||||
|
||||
void fillMinMaxLocs(const Size2D & size,
|
||||
const u16 * srcBase, ptrdiff_t srcStride,
|
||||
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
|
||||
v_minval8 = vdupq_n_u16(minVal);
|
||||
u64 mask[2] = { 0ul };
|
||||
|
||||
minLocCapacity <<= 1;
|
||||
maxLocCapacity <<= 1;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for ( ; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
|
||||
|
||||
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
|
||||
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
|
||||
|
||||
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
if (mask[1])
|
||||
process(src, j + 8, j + 16, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
for ( ; j < roiw8; j += 8)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint16x8_t v_src = vld1q_u16(src + j);
|
||||
|
||||
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
|
||||
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
|
||||
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
|
||||
|
||||
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
process(src, j, size.width, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
minLocCount >>= 1;
|
||||
maxLocCount >>= 1;
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)minVal;
|
||||
(void)minLocPtr;
|
||||
(void)minLocCount;
|
||||
(void)minLocCapacity;
|
||||
(void)maxVal;
|
||||
(void)maxLocPtr;
|
||||
(void)maxLocCount;
|
||||
(void)maxLocCapacity;
|
||||
#endif
|
||||
}
|
||||
|
||||
void fillMinMaxLocs(const Size2D & size,
|
||||
const s16 * srcBase, ptrdiff_t srcStride,
|
||||
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
|
||||
v_minval8 = vdupq_n_s16(minVal);
|
||||
u64 mask[2] = { 0ul };
|
||||
|
||||
minLocCapacity <<= 1;
|
||||
maxLocCapacity <<= 1;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for ( ; j < roiw16; j += 16)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
|
||||
|
||||
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
|
||||
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
|
||||
|
||||
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
if (mask[1])
|
||||
process(src, j + 8, j + 16, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
for ( ; j < roiw8; j += 8)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int16x8_t v_src = vld1q_s16(src + j);
|
||||
|
||||
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
|
||||
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
|
||||
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
|
||||
|
||||
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
|
||||
|
||||
if (mask[0])
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
process(src, j, size.width, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
minLocCount >>= 1;
|
||||
maxLocCount >>= 1;
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)minVal;
|
||||
(void)minLocPtr;
|
||||
(void)minLocCount;
|
||||
(void)minLocCapacity;
|
||||
(void)maxVal;
|
||||
(void)maxLocPtr;
|
||||
(void)maxLocCount;
|
||||
(void)maxLocCapacity;
|
||||
#endif
|
||||
}
|
||||
|
||||
void fillMinMaxLocs(const Size2D & size,
|
||||
const s32 * srcBase, ptrdiff_t srcStride,
|
||||
s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
|
||||
v_minval4 = vdupq_n_s32(minVal);
|
||||
u64 mask = 0ul;
|
||||
|
||||
minLocCapacity <<= 1;
|
||||
maxLocCapacity <<= 1;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for ( ; j < roiw8; j += 8)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
|
||||
|
||||
uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
|
||||
uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
|
||||
|
||||
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
|
||||
|
||||
if (mask)
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
process(src, j, size.width, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
minLocCount >>= 1;
|
||||
maxLocCount >>= 1;
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)minVal;
|
||||
(void)minLocPtr;
|
||||
(void)minLocCount;
|
||||
(void)minLocCapacity;
|
||||
(void)maxVal;
|
||||
(void)maxLocPtr;
|
||||
(void)maxLocCount;
|
||||
(void)maxLocCapacity;
|
||||
#endif
|
||||
}
|
||||
|
||||
void fillMinMaxLocs(const Size2D & size,
|
||||
const u32 * srcBase, ptrdiff_t srcStride,
|
||||
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
|
||||
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||
|
||||
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
|
||||
v_minval4 = vdupq_n_u32(minVal);
|
||||
u64 mask = 0ul;
|
||||
|
||||
minLocCapacity <<= 1;
|
||||
maxLocCapacity <<= 1;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0;
|
||||
|
||||
for ( ; j < roiw8; j += 8)
|
||||
{
|
||||
internal::prefetch(src + j);
|
||||
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
|
||||
|
||||
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
|
||||
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
|
||||
|
||||
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
|
||||
|
||||
if (mask)
|
||||
process(src, j, j + 8, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
process(src, j, size.width, i,
|
||||
minVal, minLocPtr, minLocCount, minLocCapacity,
|
||||
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
|
||||
}
|
||||
|
||||
minLocCount >>= 1;
|
||||
maxLocCount >>= 1;
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)minVal;
|
||||
(void)minLocPtr;
|
||||
(void)minLocCount;
|
||||
(void)minLocCapacity;
|
||||
(void)maxVal;
|
||||
(void)maxLocPtr;
|
||||
(void)maxLocCount;
|
||||
(void)maxLocCapacity;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
222
3rdparty/carotene/src/flip.cpp
vendored
Normal file
222
3rdparty/carotene/src/flip.cpp
vendored
Normal file
@ -0,0 +1,222 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
|
||||
{
|
||||
bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4);
|
||||
return isSupportedConfiguration() &&
|
||||
((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) ||
|
||||
(flipMode == FLIP_VERTICAL_MODE));
|
||||
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
void flip(const Size2D & size,
|
||||
const void * srcBase, ptrdiff_t srcStride,
|
||||
void * dstBase, ptrdiff_t dstStride,
|
||||
FLIP_MODE flipMode)
|
||||
{
|
||||
using namespace internal;
|
||||
|
||||
typedef typename VecTraits<T>::vec128 vec128;
|
||||
typedef typename VecTraits<T>::vec64 vec64;
|
||||
|
||||
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
|
||||
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
|
||||
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
|
||||
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
|
||||
size_t js = 0, jd = size.width;
|
||||
|
||||
for (; js < roiw_base; js += step_base, jd -= step_base)
|
||||
{
|
||||
prefetch(src + js);
|
||||
|
||||
vec128 v_src = vld1q(src + js);
|
||||
vec128 v_dst = vrev64q(v_src);
|
||||
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
|
||||
vst1q(dst + jd - step_base, v_dst);
|
||||
}
|
||||
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
|
||||
{
|
||||
vec64 v_src = vld1(src + js);
|
||||
vst1(dst + jd - step_tail, vrev64(v_src));
|
||||
}
|
||||
|
||||
for (--jd; js < size.width; ++js, --jd)
|
||||
dst[jd] = src[js];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void flip3(const Size2D & size,
|
||||
const void * srcBase, ptrdiff_t srcStride,
|
||||
void * dstBase, ptrdiff_t dstStride,
|
||||
FLIP_MODE flipMode)
|
||||
{
|
||||
using namespace internal;
|
||||
|
||||
#ifndef ANDROID
|
||||
typedef typename VecTraits<T, 3>::vec128 vec128;
|
||||
#endif
|
||||
typedef typename VecTraits<T, 3>::vec64 vec64;
|
||||
|
||||
#ifndef ANDROID
|
||||
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
|
||||
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
|
||||
#endif
|
||||
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
|
||||
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
|
||||
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
|
||||
size_t j = 0, js = 0, jd = size.width * 3;
|
||||
|
||||
#ifndef ANDROID
|
||||
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
|
||||
{
|
||||
prefetch(src + js);
|
||||
|
||||
vec128 v_src = vld3q(src + js), v_dst;
|
||||
v_src.val[0] = vrev64q(v_src.val[0]);
|
||||
v_src.val[1] = vrev64q(v_src.val[1]);
|
||||
v_src.val[2] = vrev64q(v_src.val[2]);
|
||||
|
||||
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
|
||||
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
|
||||
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
|
||||
|
||||
vst3q(dst + jd - step_base3, v_dst);
|
||||
}
|
||||
#endif // ANDROID
|
||||
|
||||
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
|
||||
{
|
||||
vec64 v_src = vld3(src + js), v_dst;
|
||||
v_dst.val[0] = vrev64(v_src.val[0]);
|
||||
v_dst.val[1] = vrev64(v_src.val[1]);
|
||||
v_dst.val[2] = vrev64(v_src.val[2]);
|
||||
|
||||
vst3(dst + jd - step_tail3, v_dst);
|
||||
}
|
||||
|
||||
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
|
||||
{
|
||||
dst[jd] = src[js];
|
||||
dst[jd + 1] = src[js + 1];
|
||||
dst[jd + 2] = src[js + 2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (* flipFunc)(const Size2D &size,
|
||||
const void * srcBase, ptrdiff_t srcStride,
|
||||
void * dstBase, ptrdiff_t dstStride,
|
||||
FLIP_MODE flipMode);
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void flip(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
FLIP_MODE flipMode, u32 elemSize)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
if (flipMode == FLIP_VERTICAL_MODE)
|
||||
{
|
||||
for (size_t y = 0; y < size.height; ++y)
|
||||
{
|
||||
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y);
|
||||
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
|
||||
|
||||
std::memcpy(dst_row, src_row, elemSize * size.width);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
flipFunc func = NULL;
|
||||
|
||||
if (elemSize == (u32)sizeof(u8))
|
||||
func = &flip<u8>;
|
||||
if (elemSize == (u32)sizeof(u16))
|
||||
func = &flip<u16>;
|
||||
if (elemSize == (u32)sizeof(u32))
|
||||
func = &flip<u32>;
|
||||
if (elemSize == (u32)sizeof(u8) * 3)
|
||||
func = &flip3<u8>;
|
||||
|
||||
if (func == NULL)
|
||||
return;
|
||||
|
||||
func(size,
|
||||
srcBase, srcStride,
|
||||
dstBase, dstStride,
|
||||
flipMode);
|
||||
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)flipMode;
|
||||
(void)elemSize;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
1059
3rdparty/carotene/src/gaussian_blur.cpp
vendored
Normal file
1059
3rdparty/carotene/src/gaussian_blur.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
195
3rdparty/carotene/src/in_range.cpp
vendored
Normal file
195
3rdparty/carotene/src/in_range.cpp
vendored
Normal file
@ -0,0 +1,195 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
|
||||
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
|
||||
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
|
||||
|
||||
template <typename T, int elsize> struct vtail
|
||||
{
|
||||
static inline void inRange(const T *, const T *, const T *,
|
||||
u8 *, size_t &, size_t)
|
||||
{
|
||||
//do nothing since there couldn't be enough data
|
||||
}
|
||||
};
|
||||
template <typename T> struct vtail<T, 2>
|
||||
{
|
||||
static inline void inRange(const T * src, const T * rng1, const T * rng2,
|
||||
u8 * dst, size_t &x, size_t width)
|
||||
{
|
||||
typedef typename internal::VecTraits<T>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
|
||||
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
|
||||
if( x + 8 < width)
|
||||
{
|
||||
vec128 vs = internal::vld1q( src + x);
|
||||
vec128 vr1 = internal::vld1q(rng1 + x);
|
||||
vec128 vr2 = internal::vld1q(rng2 + x);
|
||||
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
|
||||
internal::vst1(dst + x, internal::vmovn(vd));
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
};
|
||||
template <typename T> struct vtail<T, 1>
|
||||
{
|
||||
static inline void inRange(const T * src, const T * rng1, const T * rng2,
|
||||
u8 * dst, size_t &x, size_t width)
|
||||
{
|
||||
typedef typename internal::VecTraits<T>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
|
||||
typedef typename internal::VecTraits<T>::vec64 vec64;
|
||||
typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
|
||||
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
|
||||
if( x + 16 < width)
|
||||
{
|
||||
vec128 vs = internal::vld1q( src + x);
|
||||
vec128 vr1 = internal::vld1q(rng1 + x);
|
||||
vec128 vr2 = internal::vld1q(rng2 + x);
|
||||
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
|
||||
internal::vst1q(dst + x, vd);
|
||||
x+=16;
|
||||
}
|
||||
if( x + 8 < width)
|
||||
{
|
||||
vec64 vs = internal::vld1( src + x);
|
||||
vec64 vr1 = internal::vld1(rng1 + x);
|
||||
vec64 vr2 = internal::vld1(rng2 + x);
|
||||
uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
|
||||
internal::vst1(dst + x, vd);
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline void inRangeCheck(const Size2D &_size,
|
||||
const T * srcBase, ptrdiff_t srcStride,
|
||||
const T * rng1Base, ptrdiff_t rng1Stride,
|
||||
const T * rng2Base, ptrdiff_t rng2Stride,
|
||||
u8 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
typedef typename internal::VecTraits<T>::vec128 vec128;
|
||||
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
|
||||
|
||||
Size2D size(_size);
|
||||
if (srcStride == dstStride &&
|
||||
srcStride == rng1Stride &&
|
||||
srcStride == rng2Stride &&
|
||||
srcStride == (ptrdiff_t)(size.width))
|
||||
{
|
||||
size.width *= size.height;
|
||||
size.height = 1;
|
||||
}
|
||||
const size_t width = size.width & ~( 32/sizeof(T) - 1 );
|
||||
|
||||
for(size_t j = 0; j < size.height; ++j)
|
||||
{
|
||||
const T * src = internal::getRowPtr( srcBase, srcStride, j);
|
||||
const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
|
||||
const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
|
||||
u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
|
||||
size_t i = 0;
|
||||
for( ; i < width; i += 32/sizeof(T) )
|
||||
{
|
||||
internal::prefetch(src + i);
|
||||
internal::prefetch(rng1 + i);
|
||||
internal::prefetch(rng2 + i);
|
||||
|
||||
vec128 vs = internal::vld1q( src + i);
|
||||
vec128 vr1 = internal::vld1q(rng1 + i);
|
||||
vec128 vr2 = internal::vld1q(rng2 + i);
|
||||
uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
|
||||
vs = internal::vld1q( src + i + 16/sizeof(T));
|
||||
vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
|
||||
vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
|
||||
uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
|
||||
vnst(dst + i, vd1, vd2);
|
||||
}
|
||||
vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
|
||||
for( ; i < size.width; i++ )
|
||||
dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#define INRANGEFUNC(T) \
|
||||
void inRange(const Size2D &_size, \
|
||||
const T * srcBase, ptrdiff_t srcStride, \
|
||||
const T * rng1Base, ptrdiff_t rng1Stride, \
|
||||
const T * rng2Base, ptrdiff_t rng2Stride, \
|
||||
u8 * dstBase, ptrdiff_t dstStride) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
inRangeCheck(_size, srcBase, srcStride, \
|
||||
rng1Base, rng1Stride, rng2Base, rng2Stride, \
|
||||
dstBase, dstStride); \
|
||||
}
|
||||
#else
|
||||
#define INRANGEFUNC(T) \
|
||||
void inRange(const Size2D &, \
|
||||
const T *, ptrdiff_t, \
|
||||
const T *, ptrdiff_t, \
|
||||
const T *, ptrdiff_t, \
|
||||
u8 *, ptrdiff_t) \
|
||||
{ \
|
||||
internal::assertSupportedConfiguration(); \
|
||||
}
|
||||
#endif
|
||||
|
||||
INRANGEFUNC(u8)
|
||||
INRANGEFUNC(s8)
|
||||
INRANGEFUNC(u16)
|
||||
INRANGEFUNC(s16)
|
||||
INRANGEFUNC(s32)
|
||||
INRANGEFUNC(f32)
|
||||
|
||||
} // namespace CAROTENE_NS
|
238
3rdparty/carotene/src/integral.cpp
vendored
Normal file
238
3rdparty/carotene/src/integral.cpp
vendored
Normal file
@ -0,0 +1,238 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
void integral(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u32 * sumBase, ptrdiff_t sumStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
uint32x4_t v_zero = vmovq_n_u32(0u);
|
||||
|
||||
// the first iteration
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
|
||||
u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
|
||||
|
||||
uint32x4_t prev = v_zero;
|
||||
size_t j = 0u;
|
||||
|
||||
for ( ; j + 7 < size.width; j += 8)
|
||||
{
|
||||
internal::prefetch(sum + j);
|
||||
internal::prefetch(src + j);
|
||||
|
||||
uint8x8_t el8shr0 = vld1_u8(src + j);
|
||||
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
|
||||
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
|
||||
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
|
||||
|
||||
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
|
||||
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
|
||||
|
||||
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
|
||||
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
|
||||
|
||||
uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
|
||||
uint32x4_t vsumh = vaddw_u16(prev, el4h);
|
||||
|
||||
vst1q_u32(sum + j, vsuml);
|
||||
vst1q_u32(sum + j + 4, vsumh);
|
||||
|
||||
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
|
||||
}
|
||||
|
||||
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
|
||||
sum[j] = (v += src[j]);
|
||||
|
||||
// the others
|
||||
for (size_t i = 1; i < size.height ; ++i)
|
||||
{
|
||||
src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
|
||||
sum = internal::getRowPtr(sumBase, sumStride, i);
|
||||
|
||||
prev = v_zero;
|
||||
j = 0u;
|
||||
|
||||
for ( ; j + 7 < size.width; j += 8)
|
||||
{
|
||||
internal::prefetch(sum + j);
|
||||
internal::prefetch(src + j);
|
||||
|
||||
uint32x4_t vsuml = vld1q_u32(prevSum + j);
|
||||
uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
|
||||
|
||||
uint8x8_t el8shr0 = vld1_u8(src + j);
|
||||
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
|
||||
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
|
||||
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
|
||||
|
||||
vsuml = vaddq_u32(vsuml, prev);
|
||||
vsumh = vaddq_u32(vsumh, prev);
|
||||
|
||||
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
|
||||
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
|
||||
|
||||
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
|
||||
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
|
||||
|
||||
vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
|
||||
vsumh = vaddw_u16(vsumh, el4h);
|
||||
|
||||
vst1q_u32(sum + j, vsuml);
|
||||
vst1q_u32(sum + j + 4, vsumh);
|
||||
|
||||
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
|
||||
}
|
||||
|
||||
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
|
||||
sum[j] = (v += src[j]) + prevSum[j];
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)sumBase;
|
||||
(void)sumStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sqrIntegral(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
f64 * sqsumBase, ptrdiff_t sqsumStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
uint16x8_t v_zero8 = vmovq_n_u16(0u);
|
||||
|
||||
// the first iteration
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
|
||||
f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
|
||||
|
||||
double prev = 0.;
|
||||
size_t j = 0u;
|
||||
|
||||
for ( ; j + 7 < size.width; j += 8)
|
||||
{
|
||||
internal::prefetch(sqsum + j);
|
||||
internal::prefetch(src + j);
|
||||
|
||||
uint8x8_t vsrc = vld1_u8(src + j);
|
||||
|
||||
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
|
||||
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
|
||||
|
||||
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
|
||||
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
|
||||
|
||||
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
|
||||
|
||||
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
|
||||
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
|
||||
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
|
||||
|
||||
u32 buf[8];
|
||||
vst1_u32(buf, vget_low_u32(el8shr01l));
|
||||
vst1_u32(buf+2, el2l);
|
||||
vst1_u32(buf+4, el2hl);
|
||||
vst1_u32(buf+6, el2hh);
|
||||
for(u32 k=0; k < 8; k++)
|
||||
sqsum[j+k] = prev + buf[k];
|
||||
prev += buf[7];
|
||||
}
|
||||
|
||||
for (; j < size.width; ++j)
|
||||
sqsum[j] = (prev += src[j]*src[j]);
|
||||
|
||||
// the others
|
||||
for (size_t i = 1; i < size.height ; ++i)
|
||||
{
|
||||
src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
|
||||
sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
|
||||
|
||||
prev = 0.;
|
||||
j = 0u;
|
||||
|
||||
for ( ; j + 7 < size.width; j += 8)
|
||||
{
|
||||
internal::prefetch(sqsum + j);
|
||||
internal::prefetch(src + j);
|
||||
|
||||
uint8x8_t vsrc = vld1_u8(src + j);
|
||||
|
||||
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
|
||||
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
|
||||
|
||||
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
|
||||
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
|
||||
|
||||
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
|
||||
|
||||
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
|
||||
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
|
||||
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
|
||||
|
||||
u32 buf[8];
|
||||
vst1_u32(buf, vget_low_u32(el8shr01l));
|
||||
vst1_u32(buf+2, el2l);
|
||||
vst1_u32(buf+4, el2hl);
|
||||
vst1_u32(buf+6, el2hh);
|
||||
for(u32 k=0; k < 8; k++)
|
||||
sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
|
||||
prev += buf[7];
|
||||
}
|
||||
|
||||
for (; j < size.width; ++j)
|
||||
sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)sqsumBase;
|
||||
(void)sqsumStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
112
3rdparty/carotene/src/intrinsics.hpp
vendored
Normal file
112
3rdparty/carotene/src/intrinsics.hpp
vendored
Normal file
@ -0,0 +1,112 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_INTRINSICS_HPP
|
||||
#define CAROTENE_INTRINSICS_HPP
|
||||
|
||||
#include <carotene/definitions.hpp>
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
namespace CAROTENE_NS { namespace internal {
|
||||
|
||||
/////////////// Custom NEON intrinsics ///////////////////
|
||||
|
||||
// calculate reciprocal value
|
||||
|
||||
inline float32x4_t vrecpq_f32(float32x4_t val)
|
||||
{
|
||||
float32x4_t reciprocal = vrecpeq_f32(val);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
|
||||
return reciprocal;
|
||||
}
|
||||
|
||||
inline float32x2_t vrecp_f32(float32x2_t val)
|
||||
{
|
||||
float32x2_t reciprocal = vrecpe_f32(val);
|
||||
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
|
||||
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
|
||||
return reciprocal;
|
||||
}
|
||||
|
||||
// caclulate sqrt value
|
||||
|
||||
inline float32x4_t vrsqrtq_f32(float32x4_t val)
|
||||
{
|
||||
float32x4_t e = vrsqrteq_f32(val);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
|
||||
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
|
||||
return e;
|
||||
}
|
||||
|
||||
inline float32x2_t vrsqrt_f32(float32x2_t val)
|
||||
{
|
||||
float32x2_t e = vrsqrte_f32(val);
|
||||
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
|
||||
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
|
||||
return e;
|
||||
}
|
||||
|
||||
inline float32x4_t vsqrtq_f32(float32x4_t val)
|
||||
{
|
||||
return vrecpq_f32(vrsqrtq_f32(val));
|
||||
}
|
||||
|
||||
inline float32x2_t vsqrt_f32(float32x2_t val)
|
||||
{
|
||||
return vrecp_f32(vrsqrt_f32(val));
|
||||
}
|
||||
|
||||
// table lookup with the table in a 128-bit register
|
||||
|
||||
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
|
||||
{
|
||||
#ifdef __aarch64__
|
||||
// AArch64 supports this natively
|
||||
return ::vqtbl1_u8(a, b);
|
||||
#else
|
||||
union { uint8x16_t v; uint8x8x2_t w; } u = { a };
|
||||
return vtbl2_u8(u.w, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
} }
|
||||
|
||||
#endif
|
713
3rdparty/carotene/src/laplacian.cpp
vendored
Normal file
713
3rdparty/carotene/src/laplacian.cpp
vendored
Normal file
@ -0,0 +1,713 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
|
||||
{
|
||||
return isSupportedConfiguration() && size.width >= 8 &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REPLICATE);
|
||||
}
|
||||
|
||||
/*
 * 3x3 Laplacian filter, u8 input -> u8 output (saturated).
 *
 * The effective kernel is an all-ones 3x3 box with weight -8 at the center:
 * dst(x,y) = sum(3x3 neighbourhood) - 9 * src(x,y), clamped to [0,255].
 * This is visible in both paths: the NEON path subtracts (8*center + center)
 * via vshll_n_u8(vsub, 3) + vsub, and the scalar tail computes
 * (prevx + currx + nextx) - 9 * srow1[x].
 *
 * Strategy: a single pass per row. The vertical 3-tap sum (x0+x1+x2) is
 * computed per 8-pixel vector; three consecutive vertical-sum vectors
 * (tprev/tcurr/tnext) are then combined with vextq shifts to add the
 * horizontal neighbours. Output for vector x is stored one iteration late
 * (at drow + x - 8) because the right neighbour is only known then.
 * The last 1..8 pixels of each row are finished by the scalar tail loop.
 *
 * border/borderValue: BORDER_MODE_CONSTANT uses borderValue outside the
 * image (srow0/srow2 are NULL on the first/last row and replaced by
 * v_border); BORDER_MODE_REPLICATE clamps row/column indices.
 */
void Laplacian3x3(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
    // Precomputed constants: 3*borderValue is the vertical sum of a fully
    // out-of-image column under CONSTANT border.
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    uint8x8_t vsub;  // center row pixels of the previously processed vector
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;  // sliding window of vertical sums
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Row above / current / below; NULL marks "outside image" for the
        // CONSTANT border so the loads are replaced with v_border.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        s16 prevx = 0, currx = 0, nextx = 0;  // vertical sums for the scalar tail
        ptrdiff_t x = 0;
        // On the last two rows stop the vector loop 8 pixels early so the
        // scalar tail handles the right border safely.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift: advance the 3-vector sliding window
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value: widening vertical sum x0+x1+x2 (u16)
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border: seed tcurr so the vextq below produces the
                // correct left-border value for pixel 0.
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                vsub = x1;

                continue;  // nothing to store yet (output lags one vector)
            }

            // combine 3 "shifted" vectors: left, center, right neighbours
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them (saturating to avoid u16 wraparound)
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));

            // subtract 9*center: (center << 3) + center
            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
            uint8x8_t it0 = vqmovun_s16(tt0);  // saturate s16 -> u8
            vst1_u8(drow + x - 8, it0);        // store is one vector behind

            vsub = x1;
        }

        // Step back to the first pixel the vector loop did not store.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
            {
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);
            }

            // 3x3 sum minus 9*center == all-ones kernel with -8 center.
            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
|
||||
{
|
||||
return isSupportedConfiguration() &&
|
||||
size.width >= 8 && size.height >= 1 &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REFLECT ||
|
||||
border == BORDER_MODE_REFLECT101 ||
|
||||
border == BORDER_MODE_REPLICATE);
|
||||
}
|
||||
|
||||
/*
 * OpenCV-compatible Laplacian, aperture size 1: u8 input -> s16 output.
 *
 * The effective kernel is the 4-neighbour cross
 *     [0  1  0]
 *     [1 -4  1]
 *     [0  1  0]
 * as seen in the scalar tail: prevx + nextx - 4*v1[x] + v0[x] + v2[x].
 * (This matches OpenCV's Laplacian with ksize=1 — TODO confirm against
 * cv::Laplacian docs.)
 *
 * NEON strategy: per 8-pixel vector, tnext holds the partial result
 * (x0 + x2 - 4*x1) for the column; the horizontal neighbours are taken from
 * raw center-row bytes kept in a 3-vector window (xx0/xx1/xx2) shifted with
 * vext. The store for vector x happens one iteration late (drow + x - 8).
 * Remaining right-edge pixels are finished by the scalar tail.
 *
 * For BORDER_MODE_CONSTANT a small borderValue-filled buffer (tmp) stands in
 * for the out-of-image rows above/below the image.
 */
void Laplacian1OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Constant-border stand-in row (cols + 4 so reads at x-1/x+1 stay valid).
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;  // row above (after border resolution)
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);  // current row
        const u8* v2 = 0;  // row below (after border resolution)
        // make border: pick mirrored/clamped/constant rows for top & bottom
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            // REPLICATE and REFLECT use the same clamped row here.
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t t0, t2;
        // xx0/xx1/xx2: sliding window of raw center-row vectors used for the
        // horizontal left/right taps.
        uint8x8_t xx0 = vmov_n_u8(0x0);
        uint8x8_t xx1 = vmov_n_u8(0x0);
        uint8x8_t xx2 = vmov_n_u8(0x0);
        ptrdiff_t x = 0;
        // Stop the vector loop 8 pixels early on the last two rows.
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);

            if(x) {
                xx0 = xx1;
                xx1 = xx2;
            } else {
                xx1 = x1;
                // make border: plant the left-border pixel in lane 7 so that
                // vext_u8(xx0, xx1, 7) yields it for pixel 0.
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    xx1 = vset_lane_u8(borderValue, x1, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
                }
            }
            xx2 = x1;

            if(x) {
                tcurr = tnext;
            }
            // Column contribution: (x0 + x2) - 4*x1, widened to s16.
            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));

            if(!x) {
                tcurr = tnext;
                continue;  // output lags one vector; nothing to store yet
            }
            // Horizontal taps: left (lane shift 7) and right (lane shift 1).
            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);

            vst1q_s16(drow + x - 8, t0);  // store is one vector behind
        }

        // Resume scalar processing at the first unstored pixel.
        x -= 8;
        if(x == cols){
            x--;
        }

        for( ; x < cols; x++ )
        {
            s16 nextx;
            s16 prevx;
            // make border: resolve the horizontal neighbours at the edges
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v1[0] : v1[x-1];
                nextx = x == cols-1 ? v1[x] : v1[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v1[1] : v1[x-1];
                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v1[x-1];
                nextx = x == cols-1 ? borderValue : v1[x+1];
            }
            // Cross kernel: left + right - 4*center + up + down.
            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
/*
 * OpenCV-compatible Laplacian, aperture size 3: u8 input -> s16 output.
 *
 * The effective kernel, read off the scalar tail
 * (2 * (prevx + nextx - 4*v1[x] + prevx2 + nextx2), with prevx/nextx taken
 * from the row above and prevx2/nextx2 from the row below), is
 *     [2  0  2]
 *     [0 -8  0]
 *     [2  0  2]
 * i.e. the diagonal-cross Laplacian scaled by 2 (presumably matching
 * OpenCV's ksize=3 separable kernel — TODO confirm).
 *
 * NEON strategy: tnext holds the diagonal-row sum (x0 + x2); a 3-vector
 * window (tprev/tcurr/tnext) is shifted with vextq to fetch the left/right
 * diagonal neighbours; tc holds 4*center delayed by one vector so it lines
 * up with the lagged store at drow + x - 8. The final vshlq_n_s16 applies
 * the x2 scale. Right-edge pixels are finished by the scalar tail.
 */
void Laplacian3OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Constant-border stand-in row for out-of-image rows.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;  // row above
        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);  // current row
        const u8* v2 = 0;  // row below
        // make border: choose mirrored/clamped/constant rows at top & bottom
        if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // Sliding window of vertical sums (x0 + x2) per 8-pixel vector.
        int16x8_t tprev = vmovq_n_s16(0x0);
        int16x8_t tcurr = vmovq_n_s16(0x0);
        int16x8_t tnext = vmovq_n_s16(0x0);
        int16x8_t tc = vmovq_n_s16(0x0);  // 4*center, delayed one vector
        int16x8_t t0, t2, tcnext;
        ptrdiff_t x = 0;
        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));  // 4*center

            if(x) {
                tprev = tcurr;
                tcurr = tnext;
            }
            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));  // up + down

            if(!x) {
                tcurr = tnext;
                tc = tcnext;

                // make border: lane 7 of tcurr becomes the left-border value
                // picked up by vextq_s16(tprev, tcurr, 7) for pixel 0.
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
                }
                else if (border == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
                }
                else if (border == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
                }
                continue;  // output lags one vector
            }

            // Diagonal taps: left and right columns of the vertical sums.
            t0 = vextq_s16(tprev, tcurr, 7);
            t2 = vextq_s16(tcurr, tnext, 1);

            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
            tc = tcnext;

            t0 = vshlq_n_s16(t0, 1);  // final x2 scale
            vst1q_s16(drow + x - 8, t0);
        }
        x -= 8;
        if(x == cols){
            x--;
        }

        for( ; x < cols; x++ )
        {
            s16 nextx, nextx2;
            s16 prevx, prevx2;
            // make border: resolve the four diagonal neighbours at the edges
            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
            {
                prevx = x == 0 ? v0[0] : v0[x-1];
                prevx2 = x == 0 ? v2[0] : v2[x-1];
                nextx = x == cols-1 ? v0[x] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
            }
            else if (border == BORDER_MODE_REFLECT101)
            {
                prevx = x == 0 ? v0[1] : v0[x-1];
                prevx2 = x == 0 ? v2[1] : v2[x-1];
                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
            }
            else //if (border == BORDER_MODE_CONSTANT)
            {
                prevx = x == 0 ? borderValue : v0[x-1];
                prevx2 = x == 0 ? borderValue : v2[x-1];
                nextx = x == cols-1 ? borderValue : v0[x+1];
                nextx2 = x == cols-1 ? borderValue : v2[x+1];
            }
            // Diagonal cross minus 4*center, doubled on store.
            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
/*
 * OpenCV-compatible Laplacian, aperture size 5: u8 input -> s16 output,
 * using the five rows v0..v4 centered on y.
 *
 * Column weights, read off the scalar tail (output is 2 * the sum of the
 * five column terms at x-2 .. x+2):
 *   x-2 / x+2 : v0 + 2*v1 + 2*v2 + 2*v3 + v4
 *   x-1 / x+1 : 2*v0 - 4*v2 + 2*v4
 *   x         : 2*v0 - 4*v1 - 12*v2 - 4*v3 + 2*v4
 * (presumably the ksize=5 OpenCV Laplacian kernel — TODO confirm.)
 *
 * NEON strategy: each iteration computes three column-profile vectors
 * (tnext1 = outer profile, tnext2 = +/-1 profile, tnext3 = center profile,
 * see the inline expansions below). Because taps reach two pixels left and
 * right, results are stored one vector late and the left taps come from a
 * pipeline of "Old"/"OldOld"/"OldOldOld" delayed copies aligned with vextq
 * shifts of 7 (one lane left) and 6 (two lanes left). The !x branch seeds
 * the left-border lanes per border mode; the scalar tail finishes the
 * right-most pixels. The delayed-register bookkeeping is order-critical.
 */
void Laplacian5OpenCV(const Size2D &size,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      s16 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
    ptrdiff_t rows = size.height, cols = size.width;

    // Constant-border stand-in row for out-of-image rows.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (border == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(cols + 4,borderValue);
        tmp = &_tmp[2];
    }

    for( ptrdiff_t y = 0; y < rows; y++ )
    {
        const u8* v0 = 0;  // y-2
        const u8* v1 = 0;  // y-1
        const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);  // center row
        const u8* v3 = 0;  // y+1
        const u8* v4 = 0;  // y+2
        // make border: resolve the four outer rows per border mode
        if (border == BORDER_MODE_REPLICATE) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
        } else if (border == BORDER_MODE_REFLECT) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
        } else if (border == BORDER_MODE_REFLECT101) {
            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
        } else if (border == BORDER_MODE_CONSTANT) {
            v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
            v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
            v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
        }
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        int16x8_t tnext, tc, t0;
        int16x8_t tnext2, tnext3;
        // Delayed copies of the column profiles; "OldOldOld" lags two
        // iterations, matching the two-pixel reach of the 5-wide kernel.
        int16x8_t tnext1Old, tnext2Old, tnext3Old;
        int16x8_t tnext4OldOldOld, tnext5OldOldOld;

        // The five aligned taps that feed the output: x-2, x-1, x, x+1, x+2.
        int16x8_t tcurr1 = vmovq_n_s16(0x0);
        int16x8_t tnext1 = vmovq_n_s16(0x0);
        int16x8_t tprev1 = vmovq_n_s16(0x0);
        int16x8_t tpprev1 = vmovq_n_s16(0x0);
        int16x8_t tppprev1 = vmovq_n_s16(0x0);

        int16x8_t tnext4Old = vmovq_n_s16(0x0);
        int16x8_t tnext5Old = vmovq_n_s16(0x0);
        int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
        int16x8_t tnext5OldOld = vmovq_n_s16(0x0);

        // do vertical convolution
        ptrdiff_t x = 0;
        // Kernel reaches y+2, so only rows with y+3 < rows can vectorize the
        // full width; otherwise stop 8 pixels early.
        const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(v0 + x);
            internal::prefetch(v1 + x);
            internal::prefetch(v2 + x);
            internal::prefetch(v3 + x);
            internal::prefetch(v4 + x);

            uint8x8_t x0 = vld1_u8(v0 + x);
            uint8x8_t x1 = vld1_u8(v1 + x);
            uint8x8_t x2 = vld1_u8(v2 + x);
            uint8x8_t x3 = vld1_u8(v3 + x);
            uint8x8_t x4 = vld1_u8(v4 + x);
            if(x) {
                tcurr1 = tnext1;
            }

            // Advance the delay pipeline before computing this iteration's
            // profiles.
            tnext4OldOldOld = tnext4Old;
            tnext5OldOldOld = tnext5Old;
            tnext1Old = tnext1OldOld;
            tnext2Old = tnext2OldOld;
            tnext3Old = tnext3OldOld;
            tnext4Old = tnext4OldOld;
            tnext5Old = tnext5OldOld;

            tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
            tnext3 = vshlq_n_s16(tnext3, 1);

            tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
            tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
            tnext2 = vsubq_s16(tc, tnext);

            tnext1 = vaddq_s16(tnext3, tnext2);
            // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4

            tnext2 = vshlq_n_s16(tnext2, 1);
            // tnext2 = 2*x4 - 4*x2 + 2*x0

            tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
            // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4

            // Snapshot this iteration's profiles for the delay pipeline.
            tnext1OldOld = tnext1;
            tnext2OldOld = tnext2;
            tnext3OldOld = tnext3;
            tnext4OldOld = tnext2;
            tnext5OldOld = tnext1;

            if(x) {
                // Right taps (x+1, x+2) and center from delayed profiles.
                tnext1 = vextq_s16(tnext1Old, tnext1, 2);
                tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
                tprev1 = tnext3Old;

                if(x!=8) {
                    // Left taps (x-1, x-2) need the two-iteration delay.
                    tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
                    tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
                }
            }

            if(!x) {
                // make border: seed the left-edge lanes of the left taps so
                // the first stored vector sees correct border values.
                if (border == BORDER_MODE_REPLICATE) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
                } else if (border == BORDER_MODE_REFLECT101) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
                } else if (border == BORDER_MODE_CONSTANT) {
                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
                    tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);

                    tprev1 = vextq_s16(tnext1, tnext1, 6);
                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
                }
                tppprev1 = tprev1;
                continue;  // output lags one vector
            }

            // Sum the five taps and apply the final x2 scale via t0 + t0.
            t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
            t0 = vaddq_s16(t0, t0);
            vst1q_s16(drow + x - 8, t0);
        }
        // Scalar tail: back up to the first unstored pixel; the kernel needs
        // x+2, so never restart past cols-2.
        x -= 8;
        if(x >= cols - 1)
            x = cols-2;

        s16 pprevx = 0;
        s16 prevx = 0;
        s16 nextx = 0;
        s16 nnextx = 0;

        for( ; x < cols; x++ )
        {
            if (x == 0) {
                // make border: both left columns are outside the image
                if (border == BORDER_MODE_REPLICATE) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
                    prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;  // kernel weights at x-2 sum to 8
                    prevx = 0;                 // weights at x-1 sum to 0
                }
            } else if (x == 1) {
                // make border: only the x-2 column is outside
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
                } else if (border == BORDER_MODE_REFLECT101) {
                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
                } else if (border == BORDER_MODE_CONSTANT) {
                    pprevx = 8 * borderValue;
                }
                prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
            } else {
                pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
            }
            s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
            if (x == cols-1) {
                // make border: both right columns are outside
                if (border == BORDER_MODE_REPLICATE) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_REFLECT) {
                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
                    nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
                    nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nextx = 0;
                    nnextx = 8 * borderValue;
                }
            } else if (x == cols-2) {
                // make border: only the x+2 column is outside
                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
                    nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
                } else if (border == BORDER_MODE_REFLECT101) {
                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
                } else if (border == BORDER_MODE_CONSTANT) {
                    nnextx = 8 * borderValue;
                }
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
            } else {
                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
                nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
            }
            s16 res = pprevx + prevx + currx + nextx + nnextx;
            *(drow+x) = 2*res;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
160
3rdparty/carotene/src/magnitude.cpp
vendored
Normal file
160
3rdparty/carotene/src/magnitude.cpp
vendored
Normal file
@ -0,0 +1,160 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
// Element-wise magnitude functor for s16 inputs, used with
// internal::vtransform: dst = saturate_s16(round-toward-zero of
// sqrt(src0^2 + src1^2)). Squaring is done in s32 via vmull_s16, the
// sum/sqrt in f32, and the result is truncated back to integer
// (vcvtq_s32_f32 / the (s32) cast below).
struct Magnitude
{
    // Element type consumed/produced; read by vtransform.
    typedef s16 type;

    // Full-vector path: 8 lanes at a time, processed as two 4-lane halves.
    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
                     int16x8_t & v_dst) const
    {
        int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
        float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
        v_src0_p = vget_high_s16(v_src0);
        v_src1_p = vget_high_s16(v_src1);
        float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));

        int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
        int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));

        // vqmovn saturates the s32 results into the s16 destination.
        v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
    }

    // Half-vector path: 4 lanes at a time.
    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
                     int16x4_t & v_dst) const
    {
        float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
                                      vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
        int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
        v_dst = vqmovn_s32(v_sqrt);
    }

    // Scalar tail path: one element.
    void operator() (const short * src0, const short * src1, short * dst) const
    {
        f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
        dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
    }
};
|
||||
|
||||
// Element-wise magnitude functor for f32 inputs, used with
// internal::vtransform: dst = sqrt(src0^2 + src1^2), all in f32.
struct MagnitudeF32
{
    // Element type consumed/produced; read by vtransform.
    typedef f32 type;

    // Full-vector path: 4 lanes at a time.
    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
                     float32x4_t & v_dst) const
    {
        v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
    }

    // Half-vector path: 2 lanes at a time.
    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
                     float32x2_t & v_dst) const
    {
        v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
    }

    // Scalar tail path: one element.
    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
    {
        dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
    }
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Element-wise magnitude of two s16 images:
 * dst(x,y) = saturate_s16(sqrt(src0(x,y)^2 + src1(x,y)^2)).
 * Thin wrapper that drives internal::vtransform with the Magnitude
 * functor above; falls back to parameter-silencing stubs when NEON is
 * not compiled in.
 */
void magnitude(const Size2D &size,
               const s16 * src0Base, ptrdiff_t src0Stride,
               const s16 * src1Base, ptrdiff_t src1Stride,
               s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         Magnitude());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
/*
 * Element-wise magnitude of two f32 images:
 * dst(x,y) = sqrt(src0(x,y)^2 + src1(x,y)^2).
 * Thin wrapper over internal::vtransform using the MagnitudeF32 functor;
 * no-op stub when NEON is not compiled in.
 */
void magnitude(const Size2D &size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride,
               f32 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    internal::vtransform(size,
                         src0Base, src0Stride,
                         src1Base, src1Stride,
                         dstBase, dstStride,
                         MagnitudeF32());
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
163
3rdparty/carotene/src/meanstddev.cpp
vendored
Normal file
163
3rdparty/carotene/src/meanstddev.cpp
vendored
Normal file
@ -0,0 +1,163 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
/*
 * Mean and standard deviation of a u8 image.
 *
 * Delegates the accumulation to sqsum() (sum and sum-of-squares over the
 * whole image), then derives:
 *   mean   = sum / N
 *   stddev = sqrt(max(sqsum/N - mean^2, 0))   // clamp guards tiny negative
 *                                             // values from f64 rounding
 * Either output pointer may be NULL to skip that result.
 */
void meanStdDev(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f64 fsum = 0.0f, fsqsum = 0.0f;
    sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1);

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
|
||||
|
||||
// Computes the mean and standard deviation of a u16 image.
//
// Vector accumulators (u32 sum, f32 square-sum) are flushed into f64
// scalars every blockSize0 (=1024) elements to limit overflow and
// precision loss in the 32-bit lanes.
//
// size      - image dimensions
// srcBase   - pointer to the first source row
// srcStride - byte stride between source rows
// pMean     - optional output: mean of all pixels (may be NULL)
// pStdDev   - optional output: standard deviation of all pixels (may be NULL)
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // roiw4: widest row prefix that is a multiple of 4 (one d-register of u16).
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    // Scratch for spilling the two vector accumulators to scalars.
    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        // Process the vectorizable part of the row block by block.
        while (j < roiw4)
        {
            // Absolute end index of this block (at most blockSize0 elements).
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            // Main path: 16 pixels (two q-registers of u16) per iteration.
            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            // Remainder of the block: 4 pixels per iteration.
            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            // Spill vector accumulators and fold into the f64 totals.
            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    // Clamp at 0 before sqrt to guard against rounding-induced negatives.
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
227
3rdparty/carotene/src/median_filter.cpp
vendored
Normal file
227
3rdparty/carotene/src/median_filter.cpp
vendored
Normal file
@ -0,0 +1,227 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
/*
|
||||
* The code here is based on the code in
|
||||
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
|
||||
* See also <http://ndevilla.free.fr/median/median/index.html>.
|
||||
*/
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
namespace {
|
||||
|
||||
// Builds a vector for the left image border by replicating the first pixel.
// The input vector r holds the first 16 bytes of a row; the result is r
// shifted right by cn bytes with the first cn bytes (one pixel of cn
// channels) duplicated in front.  Goes through a stack buffer because
// NEON has no variable-amount byte shift.
// Requires cn <= 8 (buffer has 8 spare bytes past the stored vector).
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
    u8 buf[16+8];
    vst1q_u8(buf+cn, r);
    // Copy the leading pixel's channels in front of the stored vector.
    for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i];
    return vld1q_u8(buf);
}
|
||||
|
||||
// Builds a vector for the right image border by replicating the last pixel.
// The input vector r holds the final 8 bytes of a row; the result is r
// shifted left by cn bytes with the last cn bytes (one pixel of cn
// channels) duplicated after it.  Requires cn <= 8.
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
    u8 buf[8+8];
    vst1_u8(buf, r);
    // Append a copy of the trailing pixel's channels.
    for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i];
    return vld1_u8(buf+cn);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
//o------^-------^-----------------------------o 0
|
||||
// | |
|
||||
//o--^---v---^---|-------^---------------------o 1
|
||||
// | | | |
|
||||
//o--v-------v---|-------|-^-------^-------^---o 2
|
||||
// | | | | |
|
||||
//o------^-------v-----^-|-|-------|-------|---o 3
|
||||
// | | | | | |
|
||||
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
|
||||
// | | | | | | |
|
||||
//o--v-------v---^-|---|---v---|-------|-------o 5
|
||||
// | | | | |
|
||||
//o------^-------|-|---v-------|-------v-------o 6
|
||||
// | | | |
|
||||
//o--^---v---^---|-v-----------v---------------o 7
|
||||
// | | |
|
||||
//o--v-------v---v-----------------------------o 8
|
||||
|
||||
// ELT(num, level) names the SSA-style temporary that holds element `num`
// after sorting-network stage `level` (e.g. ELT(4, 19) -> v4_lv19).
#define ELT(num, level) v ## num ## _lv ## level
// PIX_SORT orders a pair: the min lands in slot `a`, the max in slot `b`,
// both tagged with `newlvl`.  PIX_MIN/PIX_MAX must be #defined by the
// caller (q-register or d-register variants) before expanding SORT9.
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
    PIX_MIN(a, alvl, b, blvl, newlvl); \
    PIX_MAX(a, alvl, b, blvl, newlvl);

// 9-element median sorting network (see diagram above); after expansion
// the median is in element 4 at level 19, i.e. v4_lv19.
#define SORT9 \
    PIX_SORT(1, 00, 2, 00, 01); \
    PIX_SORT(4, 00, 5, 00, 02); \
    PIX_SORT(7, 00, 8, 00, 03); \
    PIX_SORT(0, 00, 1, 01, 04); \
    PIX_SORT(3, 00, 4, 02, 05); \
    PIX_SORT(6, 00, 7, 03, 06); \
    PIX_SORT(1, 04, 2, 01, 07); \
    PIX_SORT(4, 05, 5, 02, 08); \
    PIX_SORT(7, 06, 8, 03, 09); \
    PIX_MAX (0, 04, 3, 05, 10); \
    PIX_MIN (5, 08, 8, 09, 11); \
    PIX_SORT(4, 08, 7, 09, 12); \
    PIX_MAX (3, 10, 6, 06, 13); \
    PIX_MAX (1, 07, 4, 12, 14); \
    PIX_MIN (2, 07, 5, 11, 15); \
    PIX_MIN (4, 14, 7, 12, 16); \
    PIX_SORT(4, 16, 2, 15, 17); \
    PIX_MAX (6, 13, 4, 17, 18); \
    PIX_MIN (4, 18, 2, 17, 19);
|
||||
|
||||
#endif
|
||||
|
||||
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
|
||||
{
|
||||
return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8;
|
||||
}
|
||||
|
||||
// Applies a 3x3 median filter to an interleaved u8 image.
//
// Each output pixel channel is the median of the 9 corresponding channel
// values in the 3x3 neighborhood, computed branch-free via the SORT9
// sorting network.  Rows/columns outside the image are taken from real
// data when the corresponding srcMargin side is non-zero, otherwise the
// edge pixel is replicated.
//
// Note the `goto` pattern: the first iteration's inputs are prepared
// outside the loop (border handling differs), then control jumps into
// the shared loop body; subsequent iterations reload inputs normally.
void medianFilter3x3(const Size2D &size, u32 numChannels,
                     const u8 *srcBase, ptrdiff_t srcStride,
                     const Margin &srcMargin,
                     u8 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
    u32 cn = numChannels;
    size_t colsn = size.width * cn;   // row length in bytes (u8 channels)

    for (size_t i = 0; i < size.height; ++i) {
        // psrc1 is the current row; psrc0/psrc2 are the rows above/below,
        // clamped to psrc1 at the top/bottom edge when no margin exists.
        const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
        const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
        u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main body: 16 output bytes per iteration with q-registers.
        {
            // First-iteration inputs: left neighbors come from the margin
            // if present, otherwise from edge replication.
            uint8x16_t v3_lv00 = vld1q_u8(psrc0);
            uint8x16_t v4_lv00 = vld1q_u8(psrc1);
            uint8x16_t v5_lv00 = vld1q_u8(psrc2);
            uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
            uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
            uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
            uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
            uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
            uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);

            goto medianBlur3x3_mainBody;

            for (; j < colsn - 16; j += 16) {
                internal::prefetch(psrc0 + j);
                internal::prefetch(psrc1 + j);
                internal::prefetch(psrc2 + j);

                // Load the 3x3 neighborhood columns (left/center/right)
                // for 16 consecutive byte positions.
                v0_lv00 = vld1q_u8(psrc0 + j - cn);
                v1_lv00 = vld1q_u8(psrc1 + j - cn);
                v2_lv00 = vld1q_u8(psrc2 + j - cn);
                v3_lv00 = vld1q_u8(psrc0 + j);
                v4_lv00 = vld1q_u8(psrc1 + j);
                v5_lv00 = vld1q_u8(psrc2 + j);
                v6_lv00 = vld1q_u8(psrc0 + j + cn);
                v7_lv00 = vld1q_u8(psrc1 + j + cn);
                v8_lv00 = vld1q_u8(psrc2 + j + cn);

            medianBlur3x3_mainBody:

// q-register min/max for the SORT9 network.
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                // Median of the 9 inputs is v4_lv19 after SORT9.
                vst1q_u8(pdst + j, v4_lv19);
            }
        }

        // Tail: process the last bytes of the row backwards in 8-byte
        // (d-register) steps, overlapping the already-written region.
        {
            size_t k = colsn - 8;
            // First (rightmost) iteration: right neighbors come from the
            // margin if present, otherwise from edge replication.
            uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
            uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
            uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
            uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
            uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
            uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
            uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
            uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
            uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);

            goto medianBlur3x3_tailBody;

            for (; k >= j - 8; k -= 8) {
                v0_lv00 = vld1_u8(psrc0 + k - cn);
                v1_lv00 = vld1_u8(psrc1 + k - cn);
                v2_lv00 = vld1_u8(psrc2 + k - cn);
                v3_lv00 = vld1_u8(psrc0 + k);
                v4_lv00 = vld1_u8(psrc1 + k);
                v5_lv00 = vld1_u8(psrc2 + k);
                v6_lv00 = vld1_u8(psrc0 + k + cn);
                v7_lv00 = vld1_u8(psrc1 + k + cn);
                v8_lv00 = vld1_u8(psrc2 + k + cn);

            medianBlur3x3_tailBody:

// d-register min/max for the SORT9 network.
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
                SORT9;
#undef PIX_MAX
#undef PIX_MIN

                vst1_u8(pdst + k, v4_lv19);
            }
        }
    }
#else
    (void)size;
    (void)numChannels;
    (void)srcBase;
    (void)srcStride;
    (void)srcMargin;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
139
3rdparty/carotene/src/min_max.cpp
vendored
Normal file
139
3rdparty/carotene/src/min_max.cpp
vendored
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
// Element-wise minimum functor for internal::vtransform: provides
// overloads for full-width (128-bit) vectors, half-width (64-bit)
// vectors, and the scalar tail.
template <typename T>
struct Min
{
    typedef T type;

    // 128-bit vector lane-wise minimum.
    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
              const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::vec128 & v_dst) const
    {
        v_dst = internal::vminq(v_src0, v_src1);
    }

    // 64-bit vector lane-wise minimum.
    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
              const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::vec64 & v_dst) const
    {
        v_dst = internal::vmin(v_src0, v_src1);
    }

    // Scalar fallback for leftover elements.
    void operator() (const T * src0, const T * src1, T * dst) const
    {
        dst[0] = std::min(src0[0], src1[0]);
    }
};
|
||||
|
||||
// Element-wise maximum functor for internal::vtransform: provides
// overloads for full-width (128-bit) vectors, half-width (64-bit)
// vectors, and the scalar tail.
template <typename T>
struct Max
{
    typedef T type;

    // 128-bit vector lane-wise maximum.
    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
              const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::vec128 & v_dst) const
    {
        v_dst = internal::vmaxq(v_src0, v_src1);
    }

    // 64-bit vector lane-wise maximum.
    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
              const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::vec64 & v_dst) const
    {
        v_dst = internal::vmax(v_src0, v_src1);
    }

    // Scalar fallback for leftover elements.
    void operator() (const T * src0, const T * src1, T * dst) const
    {
        dst[0] = std::max(src0[0], src1[0]);
    }
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// IMPL_OP expands to one public entry point (`fun`) that applies the
// element-wise functor `op` over two images via internal::vtransform.
#define IMPL_OP(fun, op, type) \
    void fun(const Size2D &size, \
             const type * src0Base, ptrdiff_t src0Stride, \
             const type * src1Base, ptrdiff_t src1Stride, \
             type * dstBase, ptrdiff_t dstStride) \
    { \
        internal::assertSupportedConfiguration(); \
        internal::vtransform(size, \
                             src0Base, src0Stride, \
                             src1Base, src1Stride, \
                             dstBase, dstStride, op<type>()); \
    }

#else

// Non-NEON build: same signatures, but assertSupportedConfiguration()
// rejects the call at runtime (all arguments intentionally unnamed).
#define IMPL_OP(fun, op, type) \
    void fun(const Size2D &, \
             const type *, ptrdiff_t, \
             const type *, ptrdiff_t, \
             type *, ptrdiff_t) \
    { \
        internal::assertSupportedConfiguration(); \
    }

#endif

// Instantiate both min() and max() entry points for one element type.
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)

IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
|
||||
|
||||
} // namespace CAROTENE_NS
|
1340
3rdparty/carotene/src/minmaxloc.cpp
vendored
Normal file
1340
3rdparty/carotene/src/minmaxloc.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
728
3rdparty/carotene/src/morph.cpp
vendored
Normal file
728
3rdparty/carotene/src/morph.cpp
vendored
Normal file
@ -0,0 +1,728 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
|
||||
{
|
||||
return isSupportedConfiguration() && size.width >= 16 &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REPLICATE);
|
||||
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
// Minimum-selecting functor used by morph3x3 to implement erosion.
// For REPLICATE borders the border value is forced to u8 max so it never
// wins the min and out-of-image samples are effectively ignored.
struct ErodeVecOp
{
    ErodeVecOp():borderValue(0){}

    ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(borderValue_)
    {
        if (border == BORDER_MODE_REPLICATE)
            borderValue = std::numeric_limits<u8>::max();
    }

    // 128-bit vector lane-wise minimum.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vminq_u8(a, b);
    }

    // 64-bit vector lane-wise minimum.
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmin_u8(a, b);
    }

    // Scalar minimum for tail elements.
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::min(a, b);
    }

    // Value substituted for out-of-image samples.
    u8 borderValue;
};
|
||||
|
||||
// Maximum-selecting functor used by morph3x3 to implement dilation.
// For REPLICATE borders the border value is forced to u8 min so it never
// wins the max and out-of-image samples are effectively ignored.
struct DilateVecOp
{
    DilateVecOp():borderValue(0){}

    DilateVecOp(BORDER_MODE border, u8 borderValue_) :
        borderValue(borderValue_)
    {
        if (border == BORDER_MODE_REPLICATE)
            borderValue = std::numeric_limits<u8>::min();
    }

    // 128-bit vector lane-wise maximum.
    inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
    {
        return vmaxq_u8(a, b);
    }

    // 64-bit vector lane-wise maximum.
    inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
    {
        return vmax_u8(a, b);
    }

    // Scalar maximum for tail elements.
    inline u8 operator()(u8 a, u8 b) const
    {
        return std::max(a, b);
    }

    // Value substituted for out-of-image samples.
    u8 borderValue;
};
|
||||
|
||||
// Shared 3x3 morphology kernel: applies the binary reduce `vop` (min for
// erode, max for dilate) over each pixel's 3x3 neighborhood.
//
// The work is separable: a first (vectorized) pass reduces the three rows
// vertically into tprev/tcurr/tnext, then three byte-shifted copies of
// those are reduced horizontally.  The last <=16 columns are redone with
// plain scalar code (prevx/currx/nextx), whose seed values are captured
// inside the vector loop when it nears the row end.
template <typename VecOp>
void morph3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, const VecOp & vop)
{
    u8 borderValue = vop.borderValue;
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    const uint8x16_t v_zero = vdupq_n_u8(0);
    const uint8x16_t v_border = vdupq_n_u8(borderValue);

    // Vertical reductions of the previous/current/next 16-column strips.
    uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Rows above/below: NULL signals "use borderValue" for CONSTANT
        // borders at the image edge; REPLICATE clamps to the edge row.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        u8 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // NOTE(review): bwidth shortens the vector loop by 16 on the last
        // two rows, presumably to avoid reading past the buffer end —
        // confirm against the allocation contract.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 16)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
            uint8x16_t x1 = vld1q_u8(srow1 + x);
            uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 16 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = vop(srow1[x4],
                              vop(srow2 ? srow2[x4] : borderValue,
                                  srow0 ? srow0[x4] : borderValue));

                currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vop(vop(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors
            t0 = vextq_u8(tprev, tcurr, 15);
            t1 = tcurr;
            t2 = vextq_u8(tcurr, tnext, 1);

            // and add them
            t0 = vop(t0, vop(t1, t2));

            // Output lags the loads by one 16-byte step.
            vst1q_u8(drow + x - 16, t0);
        }

        // Scalar pass over the columns the vector loop did not store.
        x -= 16;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
            }
            else
                nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
                                srow0 ? srow0[x + 1] : borderValue),
                            srow1[x + 1]);

            drow[x] = vop(prevx, vop(currx, nextx));

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
// 3x3 erosion (neighborhood minimum) of a u8 image.
// Supported only for the configurations accepted by isMorph3x3Supported;
// delegates to the shared morph3x3 kernel with a min functor.
void erode3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, ErodeVecOp(border, borderValue));
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
// 3x3 dilation (neighborhood maximum) of a u8 image.
// Supported only for the configurations accepted by isMorph3x3Supported;
// delegates to the shared morph3x3 kernel with a max functor.
void dilate3x3(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride,
               BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
    morph3x3(size,
             srcBase, srcStride,
             dstBase, dstStride,
             border, DilateVecOp(border, borderValue));
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
namespace {
|
||||
|
||||
// Horizontal pass of separable morphology: for each position, reduces a
// window of `ksize` pixels along the row with VecUpdate (min or max).
//
// src points at the first (leftmost) window element for output 0, so the
// caller must have prepared `width + ksize - 1` pixels of input per row.
// Widths are in pixels; all internal indices are in bytes (width * cn).
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
    size_t i, j, k;
    // Byte counts for the 16- and 8-lane vector loops.
    size_t width16 = (width & -16) * cn;
    size_t width8 = (width & -8) * cn;
    width *= cn;

    // 1x1 kernel degenerates to a copy.
    if (ksize == 1)
    {
        for (i = 0; i < width; i++)
            dst[i] = src[i];
        return;
    }

    ksize = ksize*cn;   // window length in bytes
    VecUpdate updateOp;
    switch(cn)
    {
    case 1:
        // Single-channel: the window is contiguous, step by 1 byte.
        for (i = 0; i < width16; i += 16)
        {
            const u8* sptr = src + i;
            uint8x16_t s = vld1q_u8(sptr);
            internal::prefetch(sptr);

            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1q_u8(sptr + k));

            vst1q_u8(dst + i, s);
        }

        for (; i < width8; i += 8)
        {
            const u8* sptr = src + i;
            uint8x8_t s = vld1_u8(sptr);
            internal::prefetch(sptr);

            for( k = 1; k < ksize; ++k)
                s = updateOp(s, vld1_u8(sptr + k));

            vst1_u8(dst + i, s);
        }
        break;
    default:
        // Multi-channel: window elements for one channel are cn bytes apart.
        for (i = 0; i < width16; i += 16)
        {
            uint8x16_t s = vld1q_u8(src + i);
            internal::prefetch(src + i);

            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1q_u8(src + i + k));

            vst1q_u8(dst + i, s);
        }

        for (; i < width8; i += 8)
        {
            uint8x8_t s = vld1_u8(src + i);
            internal::prefetch(src + i);

            for (k = cn; k < ksize; k += cn)
                s = updateOp(s, vld1_u8(src + i + k));

            vst1_u8(dst + i, s);
        }
        break;
    }

    // Scalar tail, channel by channel, starting where the vector loops stopped.
    ptrdiff_t i0 = i;
    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
    {
        // Two outputs per iteration: their windows share the middle
        // ksize-2 elements, so reduce those once (m) and combine with
        // s[0] for the left output and s[j] (== s[ksize]) for the right.
        for( i = i0; i <= width - cn*2; i += cn*2 )
        {
            const u8* s = src + i;
            u8 m = s[cn];
            for( j = cn*2; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = updateOp(m, s[0]);
            dst[i+cn] = updateOp(m, s[j]);
        }

        // Remaining positions: plain full-window reduction.
        for( ; i < width; i += cn )
        {
            const u8* s = src + i;
            u8 m = s[0];
            for( j = cn; j < ksize; j += cn )
                m = updateOp(m, s[j]);
            dst[i] = m;
        }
    }
}
|
||||
|
||||
// Vertical pass of separable morphology: reduces `ksize` consecutive row
// pointers from src[] with VecUpdate (min or max) into each output row.
//
// src    - array of row pointers; src[k] is the k-th row of the window
//          for the first output row
// dst    - first output row; subsequent rows are dststep bytes apart
// count  - number of output rows to produce
// width  - row length in bytes
//
// Paired processing: two consecutive output rows share ksize-1 input
// rows, so the shared rows (src[1..ksize-1]) are reduced once and then
// combined with src[0] for the first row and src[ksize] for the second.
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
    size_t i, k;
    size_t width32 = width & -32;   // bytes handled by the 32-byte vector loop
    VecUpdate updateOp;

    uint8x16_t x0,x1,s0,s1;
    // Specialized ksize == 3 path: fully unrolled window.
    if (ksize == 3)
    {
        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                // Shared middle rows: src[1] reduced with src[2].
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                sptr = src[2] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);

                // First output row: shared result + src[0].
                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));

                // Second output row: shared result + src[3].
                sptr = src[3] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));

            }
            // Scalar tail for the last width - width32 bytes.
            for(; i < width; i++ )
            {
                u8 s = src[1][i];

                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);

                dst[i] = updateOp(s, src[0][i]);
                // After the loop k == ksize, so src[k] is the extra row
                // belonging only to the second output.
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }
    }
    else if (ksize > 1)
        // General ksize > 1 path, same pairing scheme with an inner loop
        // over the shared rows.
        for (; count > 1; count -= 2, dst += dststep*2, src += 2)
        {
            for (i = 0; i < width32; i += 32)
            {
                const u8* sptr = src[1] + i;
                s0 = vld1q_u8(sptr);
                s1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                for (k = 2; k < ksize; k++)
                {
                    sptr = src[k] + i;
                    x0 = vld1q_u8(sptr);
                    x1 = vld1q_u8(sptr + 16);
                    internal::prefetch(sptr);

                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                }

                sptr = src[0] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);

                vst1q_u8(dst+i, updateOp(s0, x0));
                vst1q_u8(dst+i+16, updateOp(s1, x1));

                // k == ksize here: extra row for the second output.
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
            }
            for(; i < width; i++ )
            {
                u8 s = src[1][i];

                for( k = 2; k < ksize; k++ )
                    s = updateOp(s, src[k][i]);

                dst[i] = updateOp(s, src[0][i]);
                dst[i+dststep] = updateOp(s, src[k][i]);
            }
        }

    // Remaining single row (odd count), or all rows when ksize == 1
    // (then the inner k-loops do not run and this is a copy).
    for (; count > 0; count--, dst += dststep, src++)
    {
        for (i = 0; i < width32; i += 32)
        {
            const u8* sptr = src[0] + i;
            s0 = vld1q_u8(sptr);
            s1 = vld1q_u8(sptr + 16);
            internal::prefetch(sptr);

            for (k = 1; k < ksize; k++)
            {
                sptr = src[k] + i;
                x0 = vld1q_u8(sptr);
                x1 = vld1q_u8(sptr + 16);
                internal::prefetch(sptr);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
            }

            vst1q_u8(dst + i, s0);
            vst1q_u8(dst + i + 16, s1);
        }
        for(; i < width; i++ )
        {
            u8 s = src[0][i];
            for( k = 1; k < ksize; k++ )
                s = updateOp(s, src[k][i]);
            dst[i] = s;
        }
    }
}
|
||||
|
||||
// Generic separable morphology driver (erode/dilate selected by Op).
// Processes the image row-by-row through a ring buffer of horizontally
// pre-filtered rows: MorphRow<Op> filters each source row, MorphColumn<Op>
// then combines ksize.height buffered rows into output rows. Border pixels
// outside the ROI are synthesized per rowBorderType/columnBorderType, using
// borderMargin to reuse real image data adjacent to the ROI when available.
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
                       const u8 * srcBase, ptrdiff_t srcStride,
                       u8 * dstBase, ptrdiff_t dstStride,
                       const Size2D &ksize,
                       size_t anchorX, size_t anchorY,
                       BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
                       const u8 * borderValues, Margin borderMargin)
{
    //Temporary buffers common for all iterations

    // One source row padded on both sides for the horizontal kernel extent.
    std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
    u8* srcRow = &_srcRow[0];

    // Number of row slots in the ring buffer: enough for the vertical kernel
    // plus slack, and for the widest top/bottom border replication reach.
    size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
    std::vector<u8*> _rows(bufRows);
    u8** rows = &_rows[0];

    // adjust swidthcn so that the used part of buffers stays compact in memory
    ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
    std::vector<u8> _ringBuf(swidthcn*bufRows+16);
    u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);

    // Index table mapping padded-row border positions to source offsets
    // (used for non-constant border modes).
    size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
    std::vector<ptrdiff_t> _borderTab(borderLength);
    ptrdiff_t * borderTab = &_borderTab[0];

    std::vector<u8> _constBorderValue;
    std::vector<u8> _constBorderRow;
    u8 * constBorderValue = NULL;
    u8 * constBorderRow = NULL;
    if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
    {
        // Replicate the per-channel border value across the border span.
        _constBorderValue.resize(borderLength);
        constBorderValue = &_constBorderValue[0];
        size_t i;
        for(i = 0; i < cn; i++)
            constBorderValue[i] = borderValues[i];
        for(; i < borderLength; i++)
            constBorderValue[i] = constBorderValue[i-cn];

        if( columnBorderType == BORDER_MODE_CONSTANT )
        {
            // Pre-filter one all-border row; rows above/below the image map to it.
            _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
            constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
            size_t N = (ssize.width + ksize.width - 1)*cn;
            for( i = 0; i < N; i += borderLength )
            {
                size_t n = std::min( borderLength, N - i );
                for(size_t j = 0; j < n; j++)
                    srcRow[i+j] = constBorderValue[j];
            }
            MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
        }
    }

    // Full image extent including the margin of valid pixels around the ROI.
    Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
                     ssize.height + borderMargin.top + borderMargin.bottom);

    // Horizontal padding that cannot be satisfied from the margin and must be
    // synthesized (left and right, respectively).
    ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
    ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_MODE_CONSTANT )
        {
            // Constant border: fill the pads once; they never change per row.
            memcpy( srcRow, &constBorderValue[0], dx1*cn );
            memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
        }
        else
        {
            // Build per-pixel source-offset tables via borderInterpolate.
            ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;

            ptrdiff_t wholeWidth = wholeSize.width;

            ptrdiff_t i, j;
            for( i = 0; i < dx1; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[i*cn + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
                for( j = 0; j < (ptrdiff_t)cn; j++ )
                    borderTab[(i + dx1)*cn + j] = p0 + j;
            }
        }
    }

    // Vertical range of source rows (in whole-image coordinates) that must be
    // read to produce all output rows.
    ptrdiff_t startY, startY0, endY, rowCount;
    startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
    endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);

    const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
    u8* dst = dstBase;

    ptrdiff_t width = ssize.width, kwidth = ksize.width;
    ptrdiff_t kheight = ksize.height, ay = anchorY;
    ptrdiff_t width1 = ssize.width + kwidth - 1;
    ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
    bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
    ptrdiff_t dy = 0, i = 0;

    // Rewind src so that margin pixels left of the ROI are read directly.
    src -= xofs1*cn;
    ptrdiff_t count = endY - startY;

    rowCount = 0;
    // Main loop: alternately (1) pre-filter a batch of source rows into the
    // ring buffer, then (2) emit as many output rows as the buffer allows.
    // Exits via the `i < kheight` break once no full kernel window remains.
    for(;; dst += dstStride*i, dy += i)
    {
        // How many new source rows to pull in this iteration.
        ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcStride )
        {
            ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
            u8* brow = ringBuf + bi*swidthcn;

            // Once the ring is full, each new row evicts the oldest one.
            if( (size_t)(++rowCount) > bufRows )
            {
                --rowCount;
                ++startY;
            }

            // Copy the real pixels of this row into the padded row buffer.
            memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );

            if( makeBorder )
            {
                // Fill left/right pads by indirecting through borderTab.
                for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
                    srcRow[i] = src[borderTab[i]];
                for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
                    srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
            }

            // Horizontal pass: filtered row lands in the ring buffer slot.
            MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
        }

        // Gather pointers to the buffered rows feeding the next output rows.
        ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
                                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                rows[i] = constBorderRow;
            else
            {
                if( srcY >= startY + rowCount )
                    break;
                ptrdiff_t bi = (srcY - startY0) % bufRows;
                rows[i] = ringBuf + bi*swidthcn;
            }
        }
        // Fewer rows than the kernel height -> all output rows are done.
        if( i < kheight )
            break;
        // i buffered rows support (i - kheight + 1) output rows.
        i -= kheight - 1;
        MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
    }
}
|
||||
|
||||
} // namespace
|
||||
#endif // CAROTENE_NEON
|
||||
|
||||
// Public entry point for 8-bit erosion: validates arguments, then dispatches
// to the shared morphology driver with the minimum-selecting vector op.
// On builds without NEON support the assert still runs; the (void) casts
// silence unused-parameter warnings for the disabled body.
void erode(const Size2D &ssize, u32 cn,
           const u8 * srcBase, ptrdiff_t srcStride,
           u8 * dstBase, ptrdiff_t dstStride,
           const Size2D &ksize,
           size_t anchorX, size_t anchorY,
           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
           const u8 * borderValues, Margin borderMargin)
{
    // Non-empty ROI and anchor strictly inside the structuring element.
    internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
                                           anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
    morphology<ErodeVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
                           ksize, anchorX, anchorY, rowBorderType, columnBorderType,
                           borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
|
||||
|
||||
// Public entry point for 8-bit dilation: validates arguments, then dispatches
// to the shared morphology driver with the maximum-selecting vector op.
// Mirrors erode() exactly apart from the Op template argument.
void dilate(const Size2D &ssize, u32 cn,
            const u8 * srcBase, ptrdiff_t srcStride,
            u8 * dstBase, ptrdiff_t dstStride,
            const Size2D &ksize,
            size_t anchorX, size_t anchorY,
            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
            const u8 * borderValues, Margin borderMargin)
{
    // Non-empty ROI and anchor strictly inside the structuring element.
    internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
                                           anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
    morphology<DilateVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
                            ksize, anchorX, anchorY, rowBorderType, columnBorderType,
                            borderValues, borderMargin);
#else
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)rowBorderType;
    (void)columnBorderType;
    (void)borderValues;
    (void)borderMargin;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
1572
3rdparty/carotene/src/mul.cpp
vendored
Normal file
1572
3rdparty/carotene/src/mul.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1310
3rdparty/carotene/src/norm.cpp
vendored
Normal file
1310
3rdparty/carotene/src/norm.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
539
3rdparty/carotene/src/opticalflow.cpp
vendored
Normal file
539
3rdparty/carotene/src/opticalflow.cpp
vendored
Normal file
@ -0,0 +1,539 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
#include <vector>
|
||||
#include <float.h> // For FLT_EPSILON
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
// Fixed-point descale with round-to-nearest: adds half of 2^n before
// shifting right by n bits.
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
|
||||
|
||||
/*
|
||||
* Pyramidal Lucas-Kanade Optical Flow level processing
|
||||
*/
|
||||
// Processes one pyramid level of sparse pyramidal Lucas-Kanade optical flow.
// For each tracked point: extracts the bilinear-interpolated patch and its
// x/y derivatives from the previous image, builds the 2x2 gradient covariance
// matrix [A11 A12; A12 A22], then iterates Newton updates against the next
// image until the step is below terminationEpsilon or terminationCount is
// reached. Interpolation weights are Q14 fixed point (W_BITS = 14; the four
// weights sum to 1 << W_BITS); patch values keep 5 extra fractional bits
// (shift by W_BITS1-5). nextPts is updated in place; status/err are written
// only where the code below does so (mostly at level 0).
// NOTE(review): assumes prevDerivData holds interleaved (dx, dy) s16 pairs
// per pixel — confirm against the caller that fills it.
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       const u8 *prevData, ptrdiff_t prevStride,
                       const s16 *prevDerivData, ptrdiff_t prevDerivStride,
                       const u8 *nextData, ptrdiff_t nextStride,
                       u32 ptCount,
                       const f32 *prevPts, f32 *nextPts,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                       f32 minEigThreshold)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Offsets from a point's center to the top-left corner of its window.
    f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
    s32 cn2 = cn*2;

    // One shared scratch buffer: the interpolated patch (cn values/pixel)
    // followed by its interleaved derivatives (cn2 values/pixel).
    std::vector<s16> _buf(winSize.total()*(cn + cn2));
    s16* IWinBuf = &_buf[0];
    s32 IWinBufStride = winSize.width*cn;
    s16* derivIWinBuf = &_buf[winSize.total()*cn];
    s32 derivIWinBufStride = winSize.width*cn2;

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
        // Scale the point coordinate down to this pyramid level.
        f32 levscale = (1./(1 << level));
        u32 ptref = ptidx << 1;
        f32 prevPtX = prevPts[ptref+0]*levscale;
        f32 prevPtY = prevPts[ptref+1]*levscale;
        f32 nextPtX;
        f32 nextPtY;
        if( level == maxLevel )
        {
            // Coarsest level: seed from the caller's guess or the point itself.
            if( useInitialFlow )
            {
                nextPtX = nextPts[ptref+0]*levscale;
                nextPtY = nextPts[ptref+1]*levscale;
            }
            else
            {
                nextPtX = prevPtX;
                nextPtY = prevPtY;
            }
        }
        else
        {
            // Finer level: upscale the estimate from the coarser level.
            nextPtX = nextPts[ptref+0]*2.f;
            nextPtY = nextPts[ptref+1]*2.f;
        }
        nextPts[ptref+0] = nextPtX;
        nextPts[ptref+1] = nextPtY;

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
        // Move to the window's top-left corner and split into integer part
        // plus fractional interpolation offsets (a, b).
        prevPtX -= halfWinX;
        prevPtY -= halfWinY;
        iprevPtX = floor(prevPtX);
        iprevPtY = floor(prevPtY);

        // Window entirely outside the image: mark lost at the final level.
        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        f32 a = prevPtX - iprevPtX;
        f32 b = prevPtY - iprevPtY;
        const s32 W_BITS = 14, W_BITS1 = 14;
        // Undoes the Q(2*(W_BITS1-5)+5... ) fixed-point scaling accumulated in
        // A11/A12/A22 and b1/b2 (1/2^20).
        const f32 FLT_SCALE = 1.f/(1 << 20);
        // Q14 bilinear weights; iw11 is derived so the four sum exactly to 2^14.
        s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
        s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
        s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
        s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        // Derivative stride in elements, not bytes.
        s32 dstep = prevDerivStride/sizeof(s16);
        f32 A11 = 0, A12 = 0, A22 = 0;

        // Weights broadcast into NEON lanes.
        int16x4_t viw00 = vmov_n_s16((s16)iw00);
        int16x4_t viw01 = vmov_n_s16((s16)iw01);
        int16x4_t viw10 = vmov_n_s16((s16)iw10);
        int16x4_t viw11 = vmov_n_s16((s16)iw11);

        // Vector accumulators for the covariance terms.
        float32x4_t vA11 = vmovq_n_f32(0);
        float32x4_t vA12 = vmovq_n_f32(0);
        float32x4_t vA22 = vmovq_n_f32(0);

        s32 wwcn = winSize.width*cn;

        // extract the patch from the first image, compute covariation matrix of derivatives
        s32 x = 0;
        for(s32 y = 0; y < (s32)winSize.height; y++ )
        {
            const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
            const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;

            s16* Iptr = IWinBuf + y*IWinBufStride;
            s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

            internal::prefetch(src + x + prevStride * 2, 0);
            // 8-pixel vectorized bilinear interpolation of the image patch.
            for(x = 0; x <= wwcn - 8; x += 8)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
                int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
                int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
                int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));

                // Low/high halves accumulate the four weighted taps in a
                // deliberately interleaved order; the sum is tap-complete
                // for both halves.
                int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                // Rounding narrow keeps 5 fractional bits in the patch values.
                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
            }
            // 4-pixel tail of the same interpolation.
            for(; x <= wwcn - 4; x += 4)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
                int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
                int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
                int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));

                int32x4_t vsuml1 = vmull_s16(vs00, viw00);
                int32x4_t vsuml2 = vmull_s16(vs01, viw01);
                vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
                vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
                int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);

                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);

                vst1_s16(Iptr + x, vsumnl);
            }

            internal::prefetch(dsrc + dstep * 2, 0);
            // Interpolate the (dx, dy) derivative pairs and accumulate the
            // covariance terms; 4 pixels (8 interleaved s16) per iteration.
            for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
// NOTE(review): `__GNUC_MINOR__ < 0` can never be true (version components
// are non-negative), so this hand-written asm path is dead and the intrinsic
// branch below is always compiled. Possibly a deliberately disabled or
// mangled guard — confirm against upstream carotene before re-enabling.
#if __GNUC_MINOR__ < 0
                __asm__ (
                    "vld2.16 {d0-d1}, [%[dsrc00]]                          \n\t"
                    "vld2.16 {d2-d3}, [%[dsrc10]]                          \n\t"
                    "vld2.16 {d4-d5}, [%[dsrc01]]                          \n\t"
                    "vld2.16 {d6-d7}, [%[dsrc11]]                          \n\t"
                    "vmull.s16 q4, d3, %P[viw10]                           \n\t"
                    "vmull.s16 q5, d0, %P[viw00]                           \n\t"
                    "vmlal.s16 q4, d7, %P[viw11]                           \n\t"
                    "vmlal.s16 q5, d4, %P[viw01]                           \n\t"
                    "vmlal.s16 q4, d1, %P[viw00]                           \n\t"
                    "vmlal.s16 q5, d2, %P[viw10]                           \n\t"
                    "vmlal.s16 q4, d5, %P[viw01]                           \n\t"
                    "vmlal.s16 q5, d6, %P[viw11]                           \n\t"
                    "vrshrn.s32 d13, q4, %[W_BITS1]                        \n\t"
                    "vrshrn.s32 d12, q5, %[W_BITS1]                        \n\t"
                    "vmull.s16 q3, d13, d13                                \n\t"
                    "vmull.s16 q4, d12, d12                                \n\t"
                    "vmull.s16 q5, d13, d12                                \n\t"
                    "vcvt.f32.s32 q3, q3                                   \n\t"
                    "vcvt.f32.s32 q4, q4                                   \n\t"
                    "vcvt.f32.s32 q5, q5                                   \n\t"
                    "vadd.f32 %q[vA22], q3                                 \n\t"
                    "vadd.f32 %q[vA11], q4                                 \n\t"
                    "vadd.f32 %q[vA12], q5                                 \n\t"
                    "vst2.16 {d12-d13}, [%[out]]                           \n\t"
                    : [vA22] "=w" (vA22),
                      [vA11] "=w" (vA11),
                      [vA12] "=w" (vA12)
                    : "0" (vA22),
                      "1" (vA11),
                      "2" (vA12),
                      [out] "r" (dIptr),
                      [dsrc00] "r" (dsrc),
                      [dsrc10] "r" (dsrc + dstep),
                      [dsrc01] "r" (dsrc + cn2),
                      [dsrc11] "r" (dsrc + dstep + cn2),
                      [viw00] "w" (viw00),
                      [viw10] "w" (viw10),
                      [viw01] "w" (viw01),
                      [viw11] "w" (viw11),
                      [W_BITS1] "I" (W_BITS1)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
                );
#else
                // vld2 de-interleaves: val[0] = dx lanes, val[1] = dy lanes.
                int16x4x2_t vdsrc00 = vld2_s16(dsrc);
                int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
                int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
                int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);

                int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
                int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);

                vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
                vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);

                vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
                vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);

                vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
                vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);

                // Derivatives are narrowed by the full W_BITS1 (no extra
                // fractional bits, unlike the patch values).
                int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
                int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);

                // Covariance contributions: dy*dy, dx*dx, dx*dy.
                int32x4_t va22i = vmull_s16(vsumny, vsumny);
                int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
                int32x4_t va12i = vmull_s16(vsumnx, vsumny);

                float32x4_t va22f = vcvtq_f32_s32(va22i);
                float32x4_t va11f = vcvtq_f32_s32(va11i);
                float32x4_t va12f = vcvtq_f32_s32(va12i);

                vA22 = vaddq_f32(vA22, va22f);
                vA11 = vaddq_f32(vA11, va11f);
                vA12 = vaddq_f32(vA12, va12f);

                // Store interpolated derivatives back interleaved as (dx, dy).
                int16x4x2_t vsum;
                vsum.val[0] = vsumnx;
                vsum.val[1] = vsumny;
                vst2_s16(dIptr, vsum);
#endif
            }

            // Scalar tail for the remaining pixels of this row.
            for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
            {
                s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
                s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);
                Iptr[x] = (s16)ival;
                dIptr[0] = (s16)ixval;
                dIptr[1] = (s16)iyval;

                A11 += (f32)(ixval*ixval);
                A12 += (f32)(ixval*iyval);
                A22 += (f32)(iyval*iyval);
            }
        }

        // Horizontal reduction of the vector accumulators into the scalars.
        f32 A11buf[2], A12buf[2], A22buf[2];
        vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
        vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
        vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
        A11 += A11buf[0] + A11buf[1];
        A12 += A12buf[0] + A12buf[1];
        A22 += A22buf[0] + A22buf[1];

        A11 *= FLT_SCALE;
        A12 *= FLT_SCALE;
        A22 *= FLT_SCALE;

        // Determinant and the smaller eigenvalue of [A11 A12; A12 A22],
        // normalized by the window area (the classic trackability measure).
        f32 D = A11*A22 - A12*A12;
        f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                                            4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && getMinEigenVals )
            err[ptidx] = (f32)minEig;

        // Degenerate gradient structure: the point cannot be tracked.
        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        // Iterative Newton refinement of the displacement.
        nextPtX -= halfWinX;
        nextPtY -= halfWinY;
        f32 prevDeltaX = 0;
        f32 prevDeltaY = 0;

        for(u32 j = 0; j < terminationCount; j++ )
        {
            inextPtX = floor(nextPtX);
            inextPtY = floor(nextPtY);

            // Candidate window drifted outside the next image: give up.
            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
                inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            // New bilinear weights for the current sub-pixel position.
            a = nextPtX - inextPtX;
            b = nextPtY - inextPtY;
            iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = round(a*(1.f - b)*(1 << W_BITS));
            iw10 = round((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            f32 b1 = 0, b2 = 0;

            viw00 = vmov_n_s16((s16)iw00);
            viw01 = vmov_n_s16((s16)iw01);
            viw10 = vmov_n_s16((s16)iw10);
            viw11 = vmov_n_s16((s16)iw11);

            float32x4_t vb1 = vmovq_n_f32(0);
            float32x4_t vb2 = vmovq_n_f32(0);

            // Accumulate the mismatch vector b = sum(diff * [dx, dy]) between
            // the next-image window and the stored previous-image patch.
            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;
                const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

                x = 0;

                internal::prefetch(Jptr, nextStride * 2);
                internal::prefetch(Iptr, IWinBufStride/2);
                internal::prefetch(dIptr, derivIWinBufStride/2);

                for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
                {
                    uint8x8_t vj00 = vld1_u8(Jptr + x);
                    uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
                    uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
                    uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
                    int16x8_t vI = vld1q_s16(Iptr + x);
                    int16x8x2_t vDerivI = vld2q_s16(dIptr);

                    int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
                    int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
                    int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
                    int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));

                    // Same interleaved 4-tap bilinear accumulation as above.
                    int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                    int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                    int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                    int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                    // Saturating difference against the stored patch.
                    int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);

                    int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
                    int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
                    int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
                    int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));

                    float32x4_t vb1f = vcvtq_f32_s32(vb1i);
                    float32x4_t vb2f = vcvtq_f32_s32(vb2i);

                    vb1 = vaddq_f32(vb1, vb1f);
                    vb2 = vaddq_f32(vb2, vb2f);
                }

                // Scalar tail of the mismatch accumulation.
                for( ; x < wwcn; x++, dIptr += 2 )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    b1 += (f32)(diff*dIptr[0]);
                    b2 += (f32)(diff*dIptr[1]);
                }
            }

            // Reduce the vector accumulators into (b1, b2).
            f32 bbuf[2];
            float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
            vst1_f32(bbuf, vb);
            b1 += bbuf[0];
            b2 += bbuf[1];

            b1 *= FLT_SCALE;
            b2 *= FLT_SCALE;

            // Solve the 2x2 system A * delta = b via the explicit inverse.
            f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
            f32 deltaY = (f32)((A12*b1 - A11*b2) * D);

            nextPtX += deltaX;
            nextPtY += deltaY;
            nextPts[ptref+0] = nextPtX + halfWinX;
            nextPts[ptref+1] = nextPtY + halfWinY;

            // Converged: step length below the epsilon threshold.
            if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
                break;

            // Oscillation damping: consecutive steps nearly cancel, so settle
            // at the midpoint and stop.
            if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
                std::abs(deltaY + prevDeltaY) < 0.01 )
            {
                nextPts[ptref+0] -= deltaX*0.5f;
                nextPts[ptref+1] -= deltaY*0.5f;
                break;
            }
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }

        // Optional final-level error: mean absolute intensity residual over
        // the window (scalar only; no NEON here).
        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;

            s32 inextPointX = floor(nextPointX);
            s32 inextPointY = floor(nextPointY);

            if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
                inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            f32 aa = nextPointX - inextPointX;
            f32 bb = nextPointY - inextPointY;
            iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = round((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            f32 errval = 0.f;

            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;

                for( x = 0; x < wwcn; x++ )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((f32)diff);
                }
            }
            // 32 == 2^5 undoes the extra fractional bits of the patch values.
            err[ptidx] = errval / (32*wwcn*winSize.height);
        }
    }
#else
    (void)size;
    (void)cn;
    (void)prevData;
    (void)prevStride;
    (void)prevDerivData;
    (void)prevDerivStride;
    (void)nextData;
    (void)nextStride;
    (void)prevPts;
    (void)nextPts;
    (void)status;
    (void)err;
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
    (void)level;
    (void)maxLevel;
    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
#endif
}
|
||||
|
||||
}//CAROTENE_NS
|
||||
|
274
3rdparty/carotene/src/phase.cpp
vendored
Normal file
274
3rdparty/carotene/src/phase.cpp
vendored
Normal file
@ -0,0 +1,274 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
#define FASTATAN2CONST(scale) \
|
||||
f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \
|
||||
P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \
|
||||
P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \
|
||||
P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
|
||||
A_90((f32)(90.f * scale)), \
|
||||
A_180((f32)(180.f * scale)), \
|
||||
A_360((f32)(360.f * scale)); \
|
||||
float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
|
||||
_90(vdupq_n_f32(A_90)), \
|
||||
_180(vdupq_n_f32(A_180)), \
|
||||
_360(vdupq_n_f32(A_360)), \
|
||||
z(vdupq_n_f32(0.0f)), \
|
||||
p1(vdupq_n_f32(P1)), \
|
||||
p3(vdupq_n_f32(P3)), \
|
||||
p5(vdupq_n_f32(P5)), \
|
||||
p7(vdupq_n_f32(P7));
|
||||
|
||||
#define FASTATAN2SCALAR(y, x, a) \
|
||||
{ \
|
||||
f32 ax = std::abs(x), ay = std::abs(y); \
|
||||
f32 c, c2; \
|
||||
if (ax >= ay) \
|
||||
{ \
|
||||
c = ay / (ax + (float)DBL_EPSILON); \
|
||||
c2 = c * c; \
|
||||
a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
c = ax / (ay + (float)DBL_EPSILON); \
|
||||
c2 = c * c; \
|
||||
a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
|
||||
} \
|
||||
if (x < 0) \
|
||||
a = A_180 - a; \
|
||||
if (y < 0) \
|
||||
a = A_360 - a; \
|
||||
}
|
||||
|
||||
#define FASTATAN2VECTOR(v_y, v_x, a) \
|
||||
{ \
|
||||
float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
|
||||
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
|
||||
float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
|
||||
float32x4_t c2 = vmulq_f32(c, c); \
|
||||
a = vmulq_f32(c2, p7); \
|
||||
\
|
||||
a = vmulq_f32(vaddq_f32(a, p5), c2); \
|
||||
a = vmulq_f32(vaddq_f32(a, p3), c2); \
|
||||
a = vmulq_f32(vaddq_f32(a, p1), c); \
|
||||
\
|
||||
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
|
||||
a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
|
||||
a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
|
||||
\
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
/* Per-pixel gradient orientation: dst = atan2(src1, src0) mapped to
 * [0, 256) so a full turn fits a u8 (degrees * 256/360), rounded to
 * nearest.  src0 is the x component, src1 the y component. */
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Scale a full 360-degree turn onto the u8 range [0, 256).
    FASTATAN2CONST(256.0f / 360.0f)
    // Largest start index that still permits a full 16- / 8-element load.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    // Adding 0.5 before the float->unsigned truncation rounds to nearest
    // (the approximated angle is non-negative here).
    float32x4_t v_05 = vdupq_n_f32(0.5f);

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main loop: 16 pixels/iteration, widened s16 -> s32 -> f32 in
        // groups of four lanes for the polynomial evaluation.
        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);

            // 0
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
            float32x4_t v_dst32f0;
            // NOTE: the macro takes (y, x, out) — src1 is y, src0 is x.
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            // Round and narrow f32 -> u16 for the first 8 results.
            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // 1
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // Narrow u16 -> u8 and store 16 results at once.
            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                          vmovn_u16(v_dst16s1)));
        }
        // Secondary loop: 8 pixels/iteration for the remaining bulk.
        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src0 = vld1q_s16(src0 + j);
            int16x8_t v_src1 = vld1q_s16(src1 + j);

            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1_u8(dst + j, vmovn_u16(v_dst));
        }

        // Scalar tail: same rounding (floor(a + 0.5)) as the vector path.
        for (; j < size.width; j++)
        {
            f32 x = src0[j], y = src1[j];
            f32 a;
            FASTATAN2SCALAR(y, x, a)
            dst[j] = (u8)(s32)floor(a + 0.5f);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
/* f32 variant of phase(): dst = atan2(src1, src0) in degrees multiplied
 * by the caller-provided `scale`.  src0 is x, src1 is y. */
void phase(const Size2D &size,
           const f32 * src0Base, ptrdiff_t src0Stride,
           const f32 * src1Base, ptrdiff_t src1Stride,
           f32 * dstBase, ptrdiff_t dstStride,
           f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Bakes `scale` into the polynomial coefficients and quadrant offsets.
    FASTATAN2CONST(scale)
    // Largest start index that still permits a full 8-element iteration.
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main loop: 8 pixels/iteration as two f32x4 groups.
        for (; j < roiw8; j += 8)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
            float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);

            float32x4_t v_dst32f;
            // 0
            FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
            vst1q_f32(dst + j, v_dst32f);
            // 1
            FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
            vst1q_f32(dst + j + 4, v_dst32f);
        }
        // One extra 4-wide step if at least 4 pixels remain.
        if(j + 4 <= size.width)
        {
            float32x4_t v_src0 = vld1q_f32(src0 + j);
            float32x4_t v_src1 = vld1q_f32(src1 + j);

            float32x4_t v_dst32f;
            FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
            vst1q_f32(dst + j, v_dst32f);
            j += 4;
        }

        // Scalar tail for the last 0..3 pixels.
        for (; j < size.width; j++)
        {
            f32 a;
            FASTATAN2SCALAR(src1[j], src0[j], a)
            dst[j] = a;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
1414
3rdparty/carotene/src/pyramid.cpp
vendored
Normal file
1414
3rdparty/carotene/src/pyramid.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
460
3rdparty/carotene/src/reduce.cpp
vendored
Normal file
460
3rdparty/carotene/src/reduce.cpp
vendored
Normal file
@ -0,0 +1,460 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
/* Column-wise sum of a u8 image into one s32 row (dstBase has
 * size.width elements).  Sums saturate at INT32_MAX. */
void reduceColSum(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  s32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memset(dstBase, 0, size.width*sizeof(s32));
    size_t i = 0;
    // Vector pass: 16 columns at a time.
    for (; i + 16 <= size.width; i += 16)
    {
        const u8* src_address = srcBase + i;

        // Four s32x4 column accumulators (low/high halves of the low and
        // high 8 columns), combined with saturating adds.
        int32x4_t sll = vmovq_n_s32(0);
        int32x4_t slh = vmovq_n_s32(0);
        int32x4_t shl = vmovq_n_s32(0);
        int32x4_t shh = vmovq_n_s32(0);

        // Process rows in chunks of 256: a u16 lane can absorb at most
        // 256 additions of 255 (256*255 = 65280 <= 65535) without
        // overflowing, so the cheap u16 accumulators are flushed into
        // the s32 accumulators once per chunk.
        for (size_t h = 0; h < size.height; h += 256)
        {
            size_t lim = std::min(h + 256, size.height);

            uint16x8_t sl = vmovq_n_u16(0);
            uint16x8_t sh = vmovq_n_u16(0);

            for (size_t k = h; k < lim; ++k, src_address += srcStride)
            {
                internal::prefetch(src_address + srcStride, 0);

                uint8x16_t v = vld1q_u8(src_address);

                // Widening u8 -> u16 accumulate.
                sl = vaddw_u8(sl, vget_low_u8(v));
                sh = vaddw_u8(sh, vget_high_u8(v));
            }

            // Widen the chunk totals u16 -> s32.
            int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
            int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
            int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
            int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));

            // Saturating accumulate: clamps at INT32_MAX instead of wrapping.
            sll = vqaddq_s32(sll, vsll);
            slh = vqaddq_s32(slh, vslh);
            shl = vqaddq_s32(shl, vshl);
            shh = vqaddq_s32(shh, vshh);
        }

        vst1q_s32(dstBase + i + 0, sll);
        vst1q_s32(dstBase + i + 4, slh);
        vst1q_s32(dstBase + i + 8, shl);
        vst1q_s32(dstBase + i + 12, shh);
    }

    // Scalar tail for the remaining columns; mirrors the vector path's
    // saturation.  NOTE(review): the u32 cast detects signed wraparound
    // past INT32_MAX — relies on two's-complement wrap of the s32 add.
    for(size_t h = 0; h < size.height; ++h)
    {
        for(size_t j = i ; j < size.width; j++ )
        {
            if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu)
                dstBase[j] = 0x7fFFffFF;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}
|
||||
|
||||
void reduceColMax(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
memcpy(dstBase, srcBase, size.width);
|
||||
size_t i = 0;
|
||||
for (; i + 16*4 <= size.width; i += 16*4)
|
||||
{
|
||||
const u8* src_address = srcBase + i;
|
||||
|
||||
uint8x16_t s1 = vld1q_u8(src_address + 0);
|
||||
uint8x16_t s2 = vld1q_u8(src_address + 16);
|
||||
uint8x16_t s3 = vld1q_u8(src_address + 32);
|
||||
uint8x16_t s4 = vld1q_u8(src_address + 48);
|
||||
|
||||
src_address += srcStride;
|
||||
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
|
||||
{
|
||||
internal::prefetch(src_address + srcStride, 0);
|
||||
internal::prefetch(src_address + srcStride, 32);
|
||||
|
||||
uint8x16_t v1 = vld1q_u8(src_address + 0);
|
||||
uint8x16_t v2 = vld1q_u8(src_address + 16);
|
||||
uint8x16_t v3 = vld1q_u8(src_address + 32);
|
||||
uint8x16_t v4 = vld1q_u8(src_address + 48);
|
||||
|
||||
s1 = vmaxq_u8(s1, v1);
|
||||
s2 = vmaxq_u8(s2, v2);
|
||||
s3 = vmaxq_u8(s3, v3);
|
||||
s4 = vmaxq_u8(s4, v4);
|
||||
}
|
||||
|
||||
vst1q_u8(dstBase + i + 0, s1);
|
||||
vst1q_u8(dstBase + i + 16, s2);
|
||||
vst1q_u8(dstBase + i + 32, s3);
|
||||
vst1q_u8(dstBase + i + 48, s4);
|
||||
}
|
||||
|
||||
for (; i + 16 <= size.width; i += 16)
|
||||
{
|
||||
const u8* src_address = srcBase + i;
|
||||
uint8x16_t s1 = vld1q_u8(src_address);
|
||||
src_address += srcStride;
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
|
||||
{
|
||||
internal::prefetch(src_address + srcStride, 0);
|
||||
|
||||
uint8x16_t v1 = vld1q_u8(src_address);
|
||||
s1 = vmaxq_u8(s1, v1);
|
||||
}
|
||||
vst1q_u8(dstBase + i, s1);
|
||||
}
|
||||
|
||||
if (i < size.width)
|
||||
for(size_t h = 1; h < size.height; ++h)
|
||||
for(size_t j = i ; j < size.width; j++ )
|
||||
dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
#endif
|
||||
}
|
||||
|
||||
void reduceColMin(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
u8 * dstBase)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
memcpy(dstBase, srcBase, size.width);
|
||||
size_t i = 0;
|
||||
for (; i + 16*4 <= size.width; i += 16*4)
|
||||
{
|
||||
const u8* src_address = srcBase + i;
|
||||
|
||||
uint8x16_t s1 = vld1q_u8(src_address + 0);
|
||||
uint8x16_t s2 = vld1q_u8(src_address + 16);
|
||||
uint8x16_t s3 = vld1q_u8(src_address + 32);
|
||||
uint8x16_t s4 = vld1q_u8(src_address + 48);
|
||||
|
||||
src_address += srcStride;
|
||||
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
|
||||
{
|
||||
internal::prefetch(src_address + srcStride, 0);
|
||||
internal::prefetch(src_address + srcStride, 32);
|
||||
|
||||
uint8x16_t v1 = vld1q_u8(src_address + 0);
|
||||
uint8x16_t v2 = vld1q_u8(src_address + 16);
|
||||
uint8x16_t v3 = vld1q_u8(src_address + 32);
|
||||
uint8x16_t v4 = vld1q_u8(src_address + 48);
|
||||
|
||||
s1 = vminq_u8(s1, v1);
|
||||
s2 = vminq_u8(s2, v2);
|
||||
s3 = vminq_u8(s3, v3);
|
||||
s4 = vminq_u8(s4, v4);
|
||||
}
|
||||
|
||||
vst1q_u8(dstBase + i + 0, s1);
|
||||
vst1q_u8(dstBase + i + 16, s2);
|
||||
vst1q_u8(dstBase + i + 32, s3);
|
||||
vst1q_u8(dstBase + i + 48, s4);
|
||||
}
|
||||
|
||||
for (; i + 16 <= size.width; i += 16)
|
||||
{
|
||||
const u8* src_address = srcBase + i;
|
||||
uint8x16_t s1 = vld1q_u8(src_address);
|
||||
src_address += srcStride;
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
|
||||
{
|
||||
internal::prefetch(src_address + srcStride, 0);
|
||||
|
||||
uint8x16_t v1 = vld1q_u8(src_address);
|
||||
s1 = vminq_u8(s1, v1);
|
||||
}
|
||||
vst1q_u8(dstBase + i, s1);
|
||||
}
|
||||
|
||||
if (i < size.width)
|
||||
for(size_t h = 1; h < size.height; ++h)
|
||||
for(size_t j = i ; j < size.width; j++ )
|
||||
dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
#endif
|
||||
}
|
||||
|
||||
void reduceColSum(const Size2D &size,
|
||||
const f32 * srcBase, ptrdiff_t srcStride,
|
||||
f32 * dstBase)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
memcpy(dstBase, srcBase, size.width*sizeof(f32));
|
||||
size_t srcstep = srcStride/sizeof(f32);
|
||||
size_t i = 0;
|
||||
for (; i + 16 <= size.width; i += 16)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
|
||||
float32x4_t s1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t s2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t s3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t s4 = vld1q_f32(src_address + 12);
|
||||
|
||||
src_address += srcstep;
|
||||
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
internal::prefetch(src_address + srcstep, 32);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t v2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t v3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t v4 = vld1q_f32(src_address + 12);
|
||||
|
||||
s1 = vaddq_f32(s1, v1);
|
||||
s2 = vaddq_f32(s2, v2);
|
||||
s3 = vaddq_f32(s3, v3);
|
||||
s4 = vaddq_f32(s4, v4);
|
||||
}
|
||||
|
||||
vst1q_f32(dstBase + i + 0, s1);
|
||||
vst1q_f32(dstBase + i + 4, s2);
|
||||
vst1q_f32(dstBase + i + 8, s3);
|
||||
vst1q_f32(dstBase + i + 12, s4);
|
||||
}
|
||||
|
||||
for (; i + 4 <= size.width; i += 4)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
float32x4_t s1 = vld1q_f32(src_address);
|
||||
src_address += srcstep;
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address);
|
||||
s1 = vaddq_f32(s1, v1);
|
||||
}
|
||||
vst1q_f32(dstBase + i, s1);
|
||||
}
|
||||
|
||||
if (i < size.width)
|
||||
for(size_t h = 1; h < size.height; ++h)
|
||||
{
|
||||
for(size_t j = i ; j < size.width; j++ )
|
||||
{
|
||||
dstBase[j] += srcBase[j + srcstep * h];
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
#endif
|
||||
}
|
||||
|
||||
void reduceColMax(const Size2D &size,
|
||||
const f32 * srcBase, ptrdiff_t srcStride,
|
||||
f32 * dstBase)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
memcpy(dstBase, srcBase, size.width*sizeof(f32));
|
||||
size_t srcstep = srcStride/sizeof(f32);
|
||||
size_t i = 0;
|
||||
for (; i + 16 <= size.width; i += 16)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
|
||||
float32x4_t s1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t s2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t s3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t s4 = vld1q_f32(src_address + 12);
|
||||
|
||||
src_address += srcstep;
|
||||
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
internal::prefetch(src_address + srcstep, 32);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t v2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t v3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t v4 = vld1q_f32(src_address + 12);
|
||||
|
||||
s1 = vmaxq_f32(s1, v1);
|
||||
s2 = vmaxq_f32(s2, v2);
|
||||
s3 = vmaxq_f32(s3, v3);
|
||||
s4 = vmaxq_f32(s4, v4);
|
||||
}
|
||||
|
||||
vst1q_f32(dstBase + i + 0, s1);
|
||||
vst1q_f32(dstBase + i + 4, s2);
|
||||
vst1q_f32(dstBase + i + 8, s3);
|
||||
vst1q_f32(dstBase + i + 12, s4);
|
||||
}
|
||||
|
||||
for (; i + 4 <= size.width; i += 4)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
float32x4_t s1 = vld1q_f32(src_address);
|
||||
src_address += srcstep;
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address);
|
||||
s1 = vmaxq_f32(s1, v1);
|
||||
}
|
||||
vst1q_f32(dstBase + i, s1);
|
||||
}
|
||||
|
||||
if (i < size.width)
|
||||
for(size_t h = 1; h < size.height; ++h)
|
||||
for(size_t j = i ; j < size.width; j++ )
|
||||
dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
#endif
|
||||
}
|
||||
|
||||
void reduceColMin(const Size2D &size,
|
||||
const f32 * srcBase, ptrdiff_t srcStride,
|
||||
f32 * dstBase)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
memcpy(dstBase, srcBase, size.width*sizeof(f32));
|
||||
size_t srcstep = srcStride/sizeof(f32);
|
||||
size_t i = 0;
|
||||
for (; i + 16 <= size.width; i += 16)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
|
||||
float32x4_t s1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t s2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t s3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t s4 = vld1q_f32(src_address + 12);
|
||||
|
||||
src_address += srcstep;
|
||||
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
internal::prefetch(src_address + srcstep, 32);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address + 0);
|
||||
float32x4_t v2 = vld1q_f32(src_address + 4);
|
||||
float32x4_t v3 = vld1q_f32(src_address + 8);
|
||||
float32x4_t v4 = vld1q_f32(src_address + 12);
|
||||
|
||||
s1 = vminq_f32(s1, v1);
|
||||
s2 = vminq_f32(s2, v2);
|
||||
s3 = vminq_f32(s3, v3);
|
||||
s4 = vminq_f32(s4, v4);
|
||||
}
|
||||
|
||||
vst1q_f32(dstBase + i + 0, s1);
|
||||
vst1q_f32(dstBase + i + 4, s2);
|
||||
vst1q_f32(dstBase + i + 8, s3);
|
||||
vst1q_f32(dstBase + i + 12, s4);
|
||||
}
|
||||
|
||||
for (; i + 4 <= size.width; i += 4)
|
||||
{
|
||||
const f32* src_address = srcBase + i;
|
||||
float32x4_t s1 = vld1q_f32(src_address);
|
||||
src_address += srcstep;
|
||||
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
|
||||
{
|
||||
internal::prefetch(src_address + srcstep, 0);
|
||||
|
||||
float32x4_t v1 = vld1q_f32(src_address);
|
||||
s1 = vminq_f32(s1, v1);
|
||||
}
|
||||
vst1q_f32(dstBase + i, s1);
|
||||
}
|
||||
|
||||
if (i < size.width)
|
||||
for(size_t h = 1; h < size.height; ++h)
|
||||
for(size_t j = i ; j < size.width; j++ )
|
||||
dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]);
|
||||
#else
|
||||
(void)size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
694
3rdparty/carotene/src/remap.cpp
vendored
Normal file
694
3rdparty/carotene/src/remap.cpp
vendored
Normal file
@ -0,0 +1,694 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "remap.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace internal {
|
||||
|
||||
void remapNearestNeighborReplicate(const Size2D size,
|
||||
const u8 * srcBase,
|
||||
const s32 * map,
|
||||
u8 * dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
for (size_t y = 0; y < size.height; ++y)
|
||||
{
|
||||
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
|
||||
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
|
||||
|
||||
for (size_t x = 0; x < size.width; ++x)
|
||||
{
|
||||
dst_row[x] = srcBase[map_row[x]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remapNearestNeighborConst(const Size2D size,
|
||||
const u8 * srcBase,
|
||||
const s32 * map,
|
||||
u8 * dstBase, ptrdiff_t dstStride,
|
||||
u8 borderValue)
|
||||
{
|
||||
for (size_t y = 0; y < size.height; ++y)
|
||||
{
|
||||
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
|
||||
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
|
||||
|
||||
for (size_t x = 0; x < size.width; ++x)
|
||||
{
|
||||
s32 src_idx = map_row[x];
|
||||
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Bilinear remap.  For each output pixel the map row holds four s32
 * source offsets (indices 4x+0..4x+3; from the interpolation below these
 * are presumably the two horizontal neighbor pairs of the sample point —
 * confirm against the map builder) and the coeff row holds two f32
 * weights (2x: horizontal fraction, 2x+1: vertical fraction).
 * dst = lerp(lerp(s00,s01,wx), lerp(s10,s11,wx), wy). */
void remapLinearReplicate(const Size2D size,
                          const u8 * srcBase,
                          const s32 * map,
                          const f32 * coeffs,
                          u8 * dstBase, ptrdiff_t dstStride)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        // 4 offsets and 2 coefficients per output pixel.
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        // Vector path: gather the four neighbors of 8 pixels lane by lane
        // (offsets are arbitrary, so no vector load is possible).
        for ( ; x + 8 < size.width; x += 8)
        {
            // Offsets 4x+0, stepping by 4 per pixel: first neighbor of each pixel.
            int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);

            // Offsets 4x+1: second neighbor of each pixel.
            int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);

            // Offsets 4x+2: third neighbor of each pixel.
            int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);

            // Offsets 4x+3: fourth neighbor of each pixel.
            int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);

            // first part
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            // vld2q deinterleaves (wx, wy) pairs into val[0]=wx, val[1]=wy.
            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            // s00 + (s01 - s00) * wx  and  s10 + (s11 - s10) * wx
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            // Blend the two horizontal results vertically with wy.
            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        // Scalar tail: same bilinear formula, truncated toward -inf.
        for ( ; x < size.width; ++x)
        {
            s32 src00_index = map_row[(x << 2)];
            s32 src10_index = map_row[(x << 2) + 2];
            f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
                            srcBase[src00_index];
            f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
                            srcBase[src10_index];
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}
|
||||
|
||||
/*
 * Bilinear remap of a u8 image using a precomputed index/coefficient table,
 * with BORDER_MODE_CONSTANT semantics: a negative entry in `map` marks an
 * out-of-bounds sample and is replaced by `borderValue`.
 *
 * map    : 4 source indices per output pixel (00, 01, 10, 11 neighbours);
 *          an index of -1 means "outside the source image".
 * coeffs : 2 interpolation weights per output pixel (x-fraction, y-fraction).
 * Output is floor((v1 - v0) * wy + v0) where v0/v1 are the x-interpolated rows.
 */
void remapLinearConst(const Size2D size,
                      const u8 * srcBase,
                      const s32 * map,
                      const f32 * coeffs,
                      u8 * dstBase, ptrdiff_t dstStride,
                      u8 borderValue)
{
    int16x8_t v_zero16 = vdupq_n_s16(0);

    for (size_t y = 0; y < size.height; ++y)
    {
        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);

        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);

        size_t x = 0;
        for ( ; x + 8 < size.width; x += 8)
        {
            // Gather 8 top-left neighbours; map entries are interleaved in
            // groups of 4 per pixel, hence the stride of 4 between lanes.
            // A negative index selects the constant border value.
            int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);

            // Top-right neighbours (offset +1 within each group of 4).
            int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);

            // Bottom-left neighbours (offset +2 within each group of 4).
            int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);

            // Bottom-right neighbours (offset +3 within each group of 4).
            int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);

            // first part
            // Interpolate along x for the low 4 lanes:
            // dst = src_left + (src_right - src_left) * wx, via fused mla.
            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));

            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);

            // Then along y with the second weight; narrow to u16.
            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // second part
            // Same computation for the high 4 lanes.
            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));

            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);

            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));

            // store
            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
        }

        // Scalar tail: same bilinear formula, one pixel at a time.
        for ( ; x < size.width; ++x)
        {
            s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
            s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
            s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
            s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;

            f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
            f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
        }
    }
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
#endif // CAROTENE_NEON
|
||||
|
||||
bool isRemapNearestNeighborSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool isRemapLinearSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Nearest-neighbor remap of a u8 image driven by a per-pixel coordinate
 * table (interleaved x,y f32 pairs in tableBase). Processes the destination
 * in BLOCK_SIZE x BLOCK_SIZE tiles: for each tile it first converts the
 * coordinate table into a flat source-index map, then delegates the actual
 * pixel copy to remapNearestNeighbor{Replicate,Const}.
 * Supported border modes: BORDER_MODE_REPLICATE (clamp coordinates) and
 * BORDER_MODE_CONSTANT (out-of-range pixels become borderValue, encoded as
 * index -1 in the map).
 */
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                          const u8 * srcBase, ptrdiff_t srcStride,
                          const f32 * tableBase, ptrdiff_t tableStride,
                          u8 * dstBase, ptrdiff_t dstStride,
                          BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Stack scratch for one tile of indices; +16 slack for 16-byte alignment.
    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
    s32 * map = alignPtr(_map, 16);

    // Broadcast clamp limits and row stride for the vector paths.
    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride);
    int32x2_t v_step2 = vdup_n_s32(srcStride);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);
        int32x2_t v_zero2 = vdup_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    // 8 pixels per iteration: de-interleave (x,y), clamp to
                    // the image, then index = y * srcStride + x.
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
                        v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
                        v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                        int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                        int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
                        int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
                        int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    // Scalar tail.
                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
                        s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
                        map_row[x] = src_y * srcStride + src_x;
                    }
                }

                // make remap
                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                              getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        int32x4_t v_m1_4 = vdupq_n_s32(-1);
        int32x2_t v_m1_2 = vdup_n_s32(-1);
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        float32x2_t v_zero2 = vdup_n_f32(0.0f);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0;
                    // Vector path: build an in-bounds mask and select the
                    // computed index where valid, -1 (border marker) elsewhere.
                    for ( ; x + 8 <= blockWidth; x += 8)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                      v_table1 = vld2q_f32(table_row + (x << 1) + 8);

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);

                        v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
                        v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
                        v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                           vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x + 4, v_dst_index);
                    }

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));

                        int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                        int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                        int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_dst_index);
                    }

                    for ( ; x + 2 <= blockWidth; x += 2)
                    {
                        float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));

                        int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
                        int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
                        uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
                                                     vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
                        int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
                        vst1_s32(map_row + x, v_dst_index);
                    }

                    // Scalar tail.
                    for ( ; x < blockWidth; ++x)
                    {
                        s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
                        s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                    }
                }

                // make remap
                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                          getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }

#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
/*
 * Bilinear remap of a u8 image driven by a per-pixel coordinate table
 * (interleaved x,y f32 pairs in tableBase). Processes the destination in
 * BLOCK_SIZE x BLOCK_SIZE tiles: for each tile it builds 4 neighbour
 * indices and 2 interpolation weights per output pixel, then delegates the
 * interpolation to remapLinear{Replicate,Const}.
 * Supported border modes: BORDER_MODE_REPLICATE (neighbours clamped into
 * the image) and BORDER_MODE_CONSTANT (out-of-range neighbours encoded as
 * index -1, replaced by borderValue downstream).
 */
void remapLinear(const Size2D &ssize, const Size2D &dsize,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 const f32 * tableBase, ptrdiff_t tableStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile scratch: 4 indices and 2 weights per pixel; +16 slack for
    // 16-byte alignment.
    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];

    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        // Truncating convert, then correct toward floor:
                        // for negative fractions add 1 to the weight and
                        // step the integer coordinate back by one.
                        int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0])
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);

                        // Clamp all four neighbours into the image.
                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    // Scalar tail: floorf gives the same floor semantics as
                    // the corrected vector path above.
                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[x << 1] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);

                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0;
                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));

                        // Same floor correction as the REPLICATE path.
                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);

                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        // Per-neighbour validity masks; invalid neighbours
                        // are marked with index -1 (constant border).
                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));

                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);
                    }

                    // Scalar tail.
                    for ( ; x < blockWidth; ++x)
                    {
                        f32 src_x_f = table_row[(x << 1) + 0];
                        f32 src_y_f = table_row[(x << 1) + 1];

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1)] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)tableBase;
    (void)tableStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
85
3rdparty/carotene/src/remap.hpp
vendored
Normal file
85
3rdparty/carotene/src/remap.hpp
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_SRC_REMAP_HPP
#define CAROTENE_SRC_REMAP_HPP

#include "common.hpp"

#include <cmath>

#ifdef CAROTENE_NEON

namespace CAROTENE_NS { namespace internal {

// Destination images are processed in square tiles of this side length;
// per-tile index/coefficient scratch buffers are sized from it.
enum
{
    BLOCK_SIZE = 32
};


// Nearest-neighbor remap kernel for BORDER_MODE_REPLICATE: `map` holds one
// pre-clamped source index per destination pixel.
void remapNearestNeighborReplicate(const Size2D size,
                                   const u8 * srcBase,
                                   const s32 * map,
                                   u8 * dstBase, ptrdiff_t dstStride);

// Nearest-neighbor remap kernel for BORDER_MODE_CONSTANT: a map entry of -1
// selects borderValue instead of a source pixel.
void remapNearestNeighborConst(const Size2D size,
                               const u8 * srcBase,
                               const s32 * map,
                               u8 * dstBase, ptrdiff_t dstStride,
                               u8 borderValue);

// Bilinear remap kernel for BORDER_MODE_REPLICATE: `map` holds 4 neighbour
// indices and `coeffs` 2 interpolation weights per destination pixel.
void remapLinearReplicate(const Size2D size,
                          const u8 * srcBase,
                          const s32 * map,
                          const f32 * coeffs,
                          u8 * dstBase, ptrdiff_t dstStride);

// Bilinear remap kernel for BORDER_MODE_CONSTANT: a neighbour index of -1
// selects borderValue instead of a source pixel.
void remapLinearConst(const Size2D size,
                      const u8 * srcBase,
                      const s32 * map,
                      const f32 * coeffs,
                      u8 * dstBase, ptrdiff_t dstStride,
                      u8 borderValue);

} }

#endif // CAROTENE_NEON

#endif // CAROTENE_SRC_REMAP_HPP
|
2191
3rdparty/carotene/src/resize.cpp
vendored
Normal file
2191
3rdparty/carotene/src/resize.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
199
3rdparty/carotene/src/saturate_cast.hpp
vendored
Normal file
199
3rdparty/carotene/src/saturate_cast.hpp
vendored
Normal file
@ -0,0 +1,199 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_SATURATE_CAST_HPP
|
||||
#define CAROTENE_SATURATE_CAST_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
|
||||
#if defined _MSC_VER && defined _M_ARM
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <carotene/definitions.hpp>
|
||||
#include <carotene/types.hpp>
|
||||
|
||||
namespace CAROTENE_NS { namespace internal {
|
||||
|
||||
// Platform-specific round-to-nearest helpers. Each CAROTENE_ROUND_* macro
// expands to a `return` statement rounding its argument to s32 and is used
// by round(f32)/round(f64) below.
#if defined _MSC_VER && defined _M_ARM

// MSVC/ARM: hand-emitted VFP instructions, since this compiler provides no
// intrinsic for vcvtr (round using the current FPSCR rounding mode).
__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
{
    (void)d;
    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
    __emit(0x0B40);
    __emit(0xEE10); // vmov r0, s0
    __emit(0x0A10);
    __emit(0x4770); // bx lr
}

# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);

#elif defined CV_ICC || defined __GNUC__

# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
// Hard-float ARM: single ftosis/ftosid instruction via inline asm.
# define CAROTENE_ROUND_FLT(value) { \
    register union { f32 f; s32 i; } result; \
    asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
    return result.i; }
# define CAROTENE_ROUND_DBL(value) { \
    register union {f32 f; s32 i;} __tegra_result; \
    asm ( \
        "ftosid %0, %P1\n" \
        : "=w" (__tegra_result.f) \
        : "w" (value) \
    ); \
    return __tegra_result.i; \
}
# else
// Portable fallback: C99 lrint family rounds in the current rounding mode.
// Fix: the FLT macro's parameter was named `x` while its body used `value`;
// it only compiled because the sole call site passes a variable literally
// named `value`. Name the parameter `value` to match the body (and the DBL
// macro).
# define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
# define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
# endif

#endif
|
||||
|
||||
// Round a f32 to the nearest s32. Uses the platform CAROTENE_ROUND_FLT
// macro when available; otherwise falls back to a portable round-half-to-
// even implementation (ties go to the even integer, matching the default
// FPU rounding mode used by the fast paths).
inline s32 round(f32 value)
{
#ifdef CAROTENE_ROUND_FLT
    CAROTENE_ROUND_FLT(value)
#else
    s32 intpart = (s32)(value);
    f32 fractpart = value - intpart;
    // Ties (fraction exactly +/-0.5) with an even integer part keep the
    // truncated value; everything else rounds half away from zero.
    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
    else
        return intpart;
#endif
}
|
||||
|
||||
// Rounds a double-precision value to the nearest s32.
// Mirrors round(f32): hardware/lrint fast path via CAROTENE_ROUND_DBL when
// available, otherwise a portable round-half-to-even fallback.
inline s32 round(f64 value)
{
#ifdef CAROTENE_ROUND_DBL
    CAROTENE_ROUND_DBL(value)
#else
    s32 intpart = (s32)(value);       // truncation toward zero
    f64 fractpart = value - intpart;
    // Exact .5 ties with an even integer part keep the integer part
    // (banker's rounding); everything else rounds half away from zero.
    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
    else
        return intpart;
#endif
}
|
||||
/////////////// saturate_cast (used in image & signal processing) ///////////////////
|
||||
|
||||
template<typename _Tp> inline _Tp saturate_cast(u8 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(s8 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
|
||||
template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
|
||||
|
||||
template<> inline u8 saturate_cast<u8>(s8 v) { return (u8)std::max((s32)v, 0); }
|
||||
template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
|
||||
template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
|
||||
template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
|
||||
template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
|
||||
template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
|
||||
template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
|
||||
template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
|
||||
template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
|
||||
|
||||
template<> inline s8 saturate_cast<s8>(u8 v) { return (s8)std::min((s32)v, SCHAR_MAX); }
|
||||
template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
|
||||
template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
|
||||
template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
|
||||
template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
|
||||
template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
|
||||
template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
|
||||
template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
|
||||
template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
|
||||
|
||||
template<> inline u16 saturate_cast<u16>(s8 v) { return (u16)std::max((s32)v, 0); }
|
||||
template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
|
||||
template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
|
||||
template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
|
||||
template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
|
||||
template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
|
||||
template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
|
||||
template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
|
||||
|
||||
template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
|
||||
template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
|
||||
template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
|
||||
template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
|
||||
template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
|
||||
template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
|
||||
template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
|
||||
|
||||
template<> inline u32 saturate_cast<u32>(s8 v) { return (u32)std::max(v, (s8)0); }
|
||||
template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
|
||||
template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
|
||||
template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
|
||||
template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
|
||||
//OpenCV like f32/f64 -> u32 conversion
|
||||
//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
|
||||
template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
|
||||
template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
|
||||
//Negative clipping implementation
|
||||
//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
|
||||
//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
|
||||
|
||||
template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
|
||||
template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
|
||||
template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
|
||||
template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
|
||||
template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
|
||||
|
||||
template<> inline u64 saturate_cast<u64>(s8 v) { return (u64)std::max(v, (s8)0); }
|
||||
template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
|
||||
template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
|
||||
template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
|
||||
|
||||
template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
|
||||
|
||||
} }
|
||||
|
||||
#endif
|
219
3rdparty/carotene/src/scharr.cpp
vendored
Normal file
219
3rdparty/carotene/src/scharr.cpp
vendored
Normal file
@ -0,0 +1,219 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
|
||||
{
|
||||
return (dx == 0 && dy == 1 &&
|
||||
isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
|
||||
(dx == 1 && dy == 0 &&
|
||||
isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
|
||||
}
|
||||
|
||||
void Scharr3x3(const Size2D &size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
s16 * dstBase, ptrdiff_t dstStride,
|
||||
s32 dx, s32 dy,
|
||||
BORDER_MODE border, u8 borderValue, Margin borderMargin)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
|
||||
#ifdef CAROTENE_NEON
|
||||
static s16 dw[] = {3, 10, 3};
|
||||
|
||||
if (dy == 1)
|
||||
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
|
||||
3, 1, dw, 0,
|
||||
border, borderValue, borderMargin);
|
||||
else
|
||||
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
|
||||
1, 3, 0, dw,
|
||||
border, borderValue, borderMargin);
|
||||
#else
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)borderValue;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Interleaved Scharr derivatives over a cn-channel u8 image.
 * For each pixel, dst stores the pair (dX, dY):
 *   dX = [-1 0 1] applied horizontally to the vertically (3,10,3)-smoothed row,
 *   dY = (3,10,3) applied horizontally to the vertical difference (row below - row above).
 * Weights are unnormalized.  Row/column borders are built by mirroring
 * without the edge pixel (reflect-101 style) when the image is large
 * enough, otherwise by falling back to index 0.
 */
void ScharrDeriv(const Size2D &size, s32 cn,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 s16 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t colsn = size.width*cn;                 // elements per row (all channels)
    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;     // last x where an 8-lane load fits

    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
    // Two scratch rows; trow0 holds the smoothed row, trow1 the vertical
    // difference.  Indexing starts at &_tempBuf[cn] so that trow*[-cn + k]
    // (the left border, written below) stays inside the allocation.
    std::vector<s16> _tempBuf((delta << 1) + 64);
    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);

    int16x8_t vc3 = vmovq_n_s16(3);
    int16x8_t vc10 = vmovq_n_s16(10);
    uint8x8_t v8c10 = vmov_n_u8(10);

    for(size_t y = 0; y < size.height; y++ )
    {
        // Reflect-101 row borders: row -1 maps to row 1, row h maps to h-2.
        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);

        // do vertical convolution
        size_t x = 0;
        for( ; x < roiw8; x += 8 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
// NOTE(review): this checks only __GNUC_MINOR__, not the major version --
// presumably written for GCC 4.x, where <4.7 needed hand-written asm;
// confirm it selects the intended path on other major versions.
#if __GNUC_MINOR__ < 7
            // trow0 = (srow0 + srow2)*3 + srow1*10 ; trow1 = srow2 - srow0
            __asm__ (
                "vld1.8 {d0}, [%[src0]] \n\t"
                "vld1.8 {d2}, [%[src2]] \n\t"
                "vld1.8 {d1}, [%[src1]] \n\t"
                "vaddl.u8 q2, d2, d0 \n\t"
                "vmull.u8 q3, d1, %[vc10] \n\t"
                "vsubl.u8 q4, d2, d0 \n\t"
                "vmla.s16 q3, q2, %q[vc3] \n\t"
                "vst1.16 {d8-d9}, [%[out1],:128] \n\t"
                "vst1.16 {d6-d7}, [%[out0],:128] \n\t"
                :
                : [out0] "r" (trow0 + x),
                  [out1] "r" (trow1 + x),
                  [src0] "r" (srow0 + x),
                  [src1] "r" (srow1 + x),
                  [src2] "r" (srow2 + x),
                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
            );
#else
            // Intrinsics equivalent of the asm block above.
            uint8x8_t s0 = vld1_u8(srow0 + x);
            uint8x8_t s1 = vld1_u8(srow1 + x);
            uint8x8_t s2 = vld1_u8(srow2 + x);

            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);

            vst1q_s16(trow1 + x, t1);
            vst1q_s16(trow0 + x, t0);
#endif
        }
        // Scalar tail for the last (colsn % 8) elements.
        for( ; x < colsn; x++ )
        {
            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
            trow1[x] = (s16)(srow2[x] - srow0[x]);
        }

        // make border
        // Reflect-101 column borders per channel: column -1 mirrors
        // column 1, column w mirrors column w-2 (0 if the image is 1 wide).
        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
        for( s32 k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
        for( ; x < roiw8; x += 8 )
        {
// NOTE(review): same minor-version-only GCC check as above.
#if __GNUC_MINOR__ < 6
            // drow[2x] = trow0[x+cn]-trow0[x-cn]; drow[2x+1] = (trow1[x-cn]+trow1[x+cn])*3 + trow1[x]*10
            __asm__ (
                "vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
                "vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
                "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
                "vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
                "vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
                "vadd.i16 q7, q2, q4 \n\t"
                "vmul.s16 q6, q3, %q[vc10] \n\t"
                "vsub.s16 q5, q1, q0 \n\t"
                "vmla.s16 q6, q7, %q[vc3] \n\t"
                "vst2.16 {d10-d13}, [%[out]] \n\t"
                :
                : [out] "r" (drow + x * 2),
                  [s0ptr] "r" (trow0 + x - cn),
                  [s1ptr] "r" (trow0 + x + cn),
                  [s2ptr] "r" (trow1 + x - cn),
                  [s3ptr] "r" (trow1 + x),
                  [s4ptr] "r" (trow1 + x + cn),
                  [vc10] "w" (vc10), [vc3] "w" (vc3)
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
            );
#else
            // Intrinsics equivalent of the asm block above; vst2q interleaves
            // the (dX, dY) pairs into the destination.
            int16x8_t s0 = vld1q_s16(trow0 + x - cn);
            int16x8_t s1 = vld1q_s16(trow0 + x + cn);
            int16x8_t s2 = vld1q_s16(trow1 + x - cn);
            int16x8_t s3 = vld1q_s16(trow1 + x);
            int16x8_t s4 = vld1q_s16(trow1 + x + cn);

            int16x8_t s3x10 = vmulq_s16(s3, vc10);
            int16x8_t s24 = vaddq_s16(s2, s4);

            int16x8x2_t vr;
            vr.val[0] = vsubq_s16(s1, s0);
            vr.val[1] = vmlaq_s16(s3x10, s24, vc3);

            vst2q_s16(drow + x*2, vr);
#endif //__GNUC_MINOR__ < 6
        }
        // Scalar tail, interleaving (dX, dY) by hand.
        for( ; x < colsn; x++ )
        {
            drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
            drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
        }
    }
#else
    // Implementation is NEON-only; keep the compiler quiet otherwise.
    (void)size;
    (void)cn;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
109
3rdparty/carotene/src/separable_filter.cpp
vendored
Normal file
109
3rdparty/carotene/src/separable_filter.cpp
vendored
Normal file
@ -0,0 +1,109 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "separable_filter.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
|
||||
{
|
||||
return isSupportedConfiguration() &&
|
||||
size.width >= 9 && size.height >= 1 &&
|
||||
(size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
|
||||
(dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REFLECT ||
|
||||
border == BORDER_MODE_REFLECT101 ||
|
||||
border == BORDER_MODE_REPLICATE );
|
||||
}
|
||||
|
||||
/*
 * Runs a separable 3x3 u8 -> s16 filter over the image.
 *
 * rowFilter / colFilter select the horizontal / vertical 3-tap kernel:
 *   0 -> [1 2 1], 1 -> [-1 0 1], 2 -> [1 -2 1],
 *   3 -> generic kernel taken from xw (row) / yw (column).
 * xw / yw may be null unless the corresponding selector is 3.
 * border, borderValue and borderMargin describe how out-of-image pixels
 * are synthesized.
 */
void SeparableFilter3x3(const Size2D &size,
                        const u8 * srcBase, ptrdiff_t srcStride,
                        s16 * dstBase, ptrdiff_t dstStride,
                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    // rowFilter/colFilter double as the dx/dy arguments of the support
    // check, which also guarantees both are in [0, 3].
    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
#ifdef CAROTENE_NEON
    // Generic (code 3) kernels require explicit weights.
    if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
        std::abort();//Couldn't call generic filter without provided weights

    typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
                                            const s16*, const s16*, BORDER_MODE, u8, Margin);

    // Dispatch table indexed [colFilter][rowFilter]: every row/column kernel
    // combination is pre-instantiated from internal::sepFilter3x3.
    static sepFilter3x3_8u16s_func quickFilters[4][4]=
    {
        /*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_121>::process,
                 /*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_121>::process,
                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_121>::process,
                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},

        /*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_m101>::process,
                 /*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_m101>::process,
                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_m101>::process,
                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},

        /*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_1m21>::process,
                 /*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_1m21>::process,
                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_1m21>::process,
                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},

        /*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16Generic>::process,
                 /*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16Generic>::process,
                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16Generic>::process,
                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
    };

    quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
                                       xw, yw, border, borderValue, borderMargin);
#else
    // Implementation is NEON-only; keep the compiler quiet otherwise.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)xw;
    (void)yw;
    (void)borderValue;
#endif
}
|
||||
|
||||
|
||||
} // namespace CAROTENE_NS
|
1161
3rdparty/carotene/src/separable_filter.hpp
vendored
Normal file
1161
3rdparty/carotene/src/separable_filter.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
317
3rdparty/carotene/src/sobel.cpp
vendored
Normal file
317
3rdparty/carotene/src/sobel.cpp
vendored
Normal file
@ -0,0 +1,317 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
|
||||
s32 dx, s32 dy, Margin borderMargin)
|
||||
{
|
||||
return dx < 3 && dx >= 0 &&
|
||||
dy < 3 && dy >= 0 &&
|
||||
(dx + dy) > 0 &&
|
||||
isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
|
||||
}
|
||||
|
||||
/*
 * 3x3 Sobel filter, u8 source -> s16 destination.
 * Pure delegation: the dx/dy derivative orders double as the
 * row/column filter selector codes of SeparableFilter3x3, and no
 * custom weights are needed (both weight pointers are null).
 */
void Sobel3x3(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              s16 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
                       dx, dy, 0, 0,
                       borderType, borderValue, borderMargin);
#else
    // Implementation is NEON-only; keep the compiler quiet otherwise.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
|
||||
|
||||
bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
|
||||
s32 dx, s32 dy)
|
||||
{
|
||||
return isSupportedConfiguration() &&
|
||||
dx < 3 && dx >= 0 &&
|
||||
dy < 3 && dy >= 0 &&
|
||||
(dx + dy) > 0 &&
|
||||
size.width >= 4 && size.height >= 2 &&
|
||||
(border == BORDER_MODE_CONSTANT ||
|
||||
border == BORDER_MODE_REFLECT ||
|
||||
border == BORDER_MODE_REFLECT101 ||
|
||||
border == BORDER_MODE_REPLICATE );
|
||||
}
|
||||
|
||||
/*
 * 3x3 Sobel filter, f32 source -> f32 destination.
 * dx/dy in {0,1,2} select the kernel per direction:
 *   0 -> [1 2 1] (smoothing), 1 -> [-1 0 1], 2 -> [1 -2 1].
 * Results for a row are staged in one of two scratch rows (trow0/trow1,
 * alternating with the parity of y) because the final value of row y-1 is
 * only known after row y has been processed; row y-1 is flushed at the
 * start of the next iteration and the last row at the very end.
 */
void Sobel3x3(const Size2D &size,
              const f32 * srcBase, ptrdiff_t srcStride,
              f32 * dstBase, ptrdiff_t dstStride,
              s32 dx, s32 dy,
              BORDER_MODE borderType, f32 borderValue)
{
    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
#ifdef CAROTENE_NEON
    // For CONSTANT borders, a pre-filled row of borderValue stands in for
    // the missing rows above/below the image.
    std::vector<f32> _tmp;
    f32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(size.width + 2, borderValue);
        tmp = &_tmp[1];
    }

    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
    // Two 32-byte-aligned scratch rows used alternately (see header comment).
    std::vector<f32> _tempBuf((delta << 1) + 64);
    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);

    for( size_t y = 0; y < size.height; y++ )
    {
        const f32* srow0;
        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const f32* srow2;
        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);   // row flushed this iteration
        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);                // used only for the final row
        // Select the rows above/below according to the border mode.
        if (borderType == BORDER_MODE_REFLECT101) {
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
        } else if (borderType == BORDER_MODE_CONSTANT) {
            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
        }

        // Sliding 3-vector window over the vertically filtered row.
        float32x4_t tprev = vmovq_n_f32(0.f);
        float32x4_t tcurr = vmovq_n_f32(0.f);
        float32x4_t tnext = vmovq_n_f32(0.f);
        float32x4_t t0, t1, t2;
        // do vertical convolution
        // NOTE(review): with x <= bcolsn the last iteration loads up to 3
        // elements past the row end; presumably this is safe only while at
        // least two more rows of in-image memory follow, hence the shorter
        // bound (size.width - 4) on the bottom rows -- confirm.
        size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
        for( ; x <= bcolsn; x += 4 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            float32x4_t x0 = vld1q_f32(srow0 + x);
            float32x4_t x1 = vld1q_f32(srow1 + x);
            float32x4_t x2 = vld1q_f32(srow2 + x);

            tprev = tcurr;
            tcurr = tnext;
            // Vertical pass: apply the dy kernel across the three rows.
            if(!dy)
            {
                tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);   // x0 + 2*x1 + x2
            }
            else if(dy == 2)
            {
                tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));   // x0 - 2*x1 + x2
            }
            else
            {
                tnext = vsubq_f32(x2, x0);                                 // x2 - x0
            }

            if(!x) {
                tcurr = tnext;
                // make border
                // Lane 3 of tcurr plays the role of element -1 once the
                // window advances (vextq below reads it as the left neighbor).
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    tcurr = vsetq_lane_f32(borderValue,tcurr, 3);
                }
                else if (borderType == BORDER_MODE_REFLECT101)
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
                }
                else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
                {
                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
                }
                continue;
            }

            internal::prefetch(trow0 + x);
            internal::prefetch(trow1 + x);

            // Horizontal pass: t0/t1/t2 are the left/center/right neighbors
            // of the vertically filtered values.
            t0 = vextq_f32(tprev, tcurr, 3);
            t1 = tcurr;
            t2 = vextq_f32(tcurr, tnext, 1);
            if(!dx)
            {
                t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));      // t0 + 2*t1 + t2
            }
            else if(dx == 2)
            {
                t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));      // t0 - 2*t1 + t2
            }
            else
            {
                t0 = vsubq_f32(t2, t0);                                    // t2 - t0
            }

            // The window lags one vector behind the loads, hence "x - 4".
            if(!(y%2))
            {
                vst1q_f32(trow0 + x - 4, t0);
            }
            else
            {
                vst1q_f32(trow1 + x - 4, t0);
            }
        }
        // Rewind to the first column not yet produced by the vector loop.
        x -= 4;
        if(x == size.width){
            x--;   // the last column always needs the scalar right-border path
        }
        // Scalar tail: recompute the vertical results around column x.
        f32 prevx = 0, rowx = 0, nextx = 0;
        if(!dy)
        {
            prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 4*borderValue :
                             srow2[0] + 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] + 2*srow1[x] + srow0[x];
        }
        else if(dy == 2)
        {
            prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 0.f :
                             srow2[0] - 2*srow1[0] + srow0[0]) );
            rowx = srow2[x] - 2*srow1[x] + srow0[x];
        }
        else
        {
            prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
                            (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
                            (borderType == BORDER_MODE_CONSTANT ? 0.f :
                             srow2[0] - srow0[0]) );
            rowx = srow2[x] - srow0[x];
        }

        for( ; x < size.width; x++ )
        {
            if(x+1 == size.width) {
                // make border
                // Synthesize the vertical result for the out-of-image column.
                if (borderType == BORDER_MODE_CONSTANT)
                {
                    if(!dy) {
                        nextx = 4*borderValue;   // [1 2 1] of a constant row triple
                    } else {
                        nextx = 0.f;             // derivative kernels of a constant are zero
                    }
                } else if (borderType == BORDER_MODE_REFLECT101)
                {
                    if(!dy) {
                        nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
                    } else if(dy == 2) {
                        nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
                    } else {
                        nextx = srow2[x-1] - srow0[x-1];
                    }
                } else {
                    if(!dy) {
                        nextx = srow2[x] + 2*srow1[x] + srow0[x];
                    } else if(dy == 2) {
                        nextx = srow2[x] - 2*srow1[x] + srow0[x];
                    } else {
                        nextx = srow2[x] - srow0[x];
                    }
                }
            } else {
                if(!dy) {
                    nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
                } else if(dy == 2) {
                    nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
                } else {
                    nextx = srow2[x+1] - srow0[x+1];
                }
            }
            // Horizontal pass over the scalar triple (prevx, rowx, nextx).
            f32 res;
            if(dx==1) {
                res = nextx - prevx;
            } else if(!dx) {
                res = prevx + 2*rowx + nextx;
            } else {
                res = prevx - 2*rowx + nextx;
            }
            if(!(y%2)) {
                *(trow0+x) = res;
            } else {
                *(trow1+x) = res;
            }
            prevx = rowx;
            rowx = nextx;
        }

        // Flush row y-1: it lives in the OTHER scratch row than the one
        // just written for row y.
        if(y>0) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(y%2)
                    *(drow + x1) = trow0[x1];
                else
                    *(drow + x1) = trow1[x1];
            }
        }
        // Flush the final row directly from the scratch row just written.
        if(y == size.height-1) {
            for(size_t x1 = 0; x1 < size.width; x1++ )
            {
                if(!(y%2))
                    *(drow1 + x1) = trow0[x1];
                else
                    *(drow1 + x1) = trow1[x1];
            }
        }
    }
#else
    // Implementation is NEON-only; keep the compiler quiet otherwise.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
621
3rdparty/carotene/src/sub.cpp
vendored
Normal file
621
3rdparty/carotene/src/sub.cpp
vendored
Normal file
@ -0,0 +1,621 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct SubWrap
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vsubq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vsub(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename WT>
|
||||
struct SubSaturate
|
||||
{
|
||||
typedef T type;
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec128 & v_src1,
|
||||
typename internal::VecTraits<T>::vec128 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vqsubq(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
|
||||
const typename internal::VecTraits<T>::vec64 & v_src1,
|
||||
typename internal::VecTraits<T>::vec64 & v_dst) const
|
||||
{
|
||||
v_dst = internal::vqsub(v_src0, v_src1);
|
||||
}
|
||||
|
||||
void operator() (const T * src0, const T * src1, T * dst) const
|
||||
{
|
||||
dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const u8 * src0Base, ptrdiff_t src0Stride,
|
||||
const u8 * src1Base, ptrdiff_t src1Stride,
|
||||
u8 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<u8, s16>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<u8, s16>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * u8 - u8 -> s16 subtraction over a 2D region.
 *
 * A u8 difference always fits in [-255, 255], so wrap and saturate policies
 * produce identical results; the CONVERT_POLICY parameter is intentionally
 * unnamed and ignored.
 */
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Largest widths the 32- and 8-element vector loops may start at
    // without reading past the end of a row.
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        // Same destination row viewed as u16: vsubl_u8 widens modularly, so
        // the u16 bit pattern it produces is exactly the two's-complement
        // s16 difference (e.g. 0 - 255 -> 65281 == (s16)-255).
        u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main loop: 32 pixels per iteration.
        for (; j < roiw32; j += 32)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);
            uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
            uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
            vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
            vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
            vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
            vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
        }
        // Secondary loop: 8 pixels per iteration.
        for (; j < roiw8; j += 8)
        {
            uint8x8_t v_src0 = vld1_u8(src0 + j);
            uint8x8_t v_src1 = vld1_u8(src1 + j);
            vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1));
        }

        // Scalar tail for the remaining < 8 pixels.
        for (; j < size.width; j++)
            dst[j] = (s16)src0[j] - (s16)src1[j];
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
/**
 * u8 - u8 -> f32 subtraction over a 2D region.
 * Each difference is computed exactly in s16 ([-255, 255]) and then
 * converted to float; no policy parameter since no overflow is possible.
 */
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         f32 *dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Largest widths at which the 32- and 8-element loops may still start.
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main loop: 32 pixels per iteration.
        // Pipeline per 16 pixels: widening subtract to s16, widen halves to
        // s32, convert to f32, store 4 floats at a time.
        for (; j < roiw32; j += 32)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);
            uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
            uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
            int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10)));
            int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));

            vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
            vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
            vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
            vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));

            vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11)));
            vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));

            vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
            vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
            vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
            vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));
        }
        // Secondary loop: 8 pixels per iteration.
        for (; j < roiw8; j += 8)
        {
            uint8x8_t v_src0 = vld1_u8(src0 + j);
            uint8x8_t v_src1 = vld1_u8(src1 + j);

            int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1));
            vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) )));
            vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) )));
        }
        // Scalar tail.
        for(; j < size.width; j++)
            dst[j] = (f32)src0[j] - (f32)src1[j];
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
|
||||
|
||||
/**
 * u8 - s16 -> s16 subtraction over a 2D region.
 * The u8 operand is zero-extended to s16; the subtraction then either
 * saturates to the s16 range or wraps, per the requested policy.
 */
void sub(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Largest widths at which the 16- and 8-element loops may still start.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (policy == CONVERT_POLICY_SATURATE)
        {
            // Saturating path: vqsubq_s16 clamps to [-32768, 32767].
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j);
                // Zero-extend u8 halves to u16, reinterpret as s16
                // (values stay in [0, 255], so the reinterpret is exact).
                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
                vst1q_s16(dst + j, v_dst0);
                vst1q_s16(dst + j + 8, v_dst1);
            }
            for (; j < roiw8; j += 8)
            {
                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
                int16x8_t v_src1 = vld1q_s16(src1 + j);
                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
                vst1q_s16(dst + j, v_dst);
            }

            // Scalar tail: widen to s32 then clamp.
            for (; j < size.width; j++)
                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
        }
        else
        {
            // Wrapping path: vsubq_s16 truncates modulo 2^16.
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j);
                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
                vst1q_s16(dst + j, v_dst0);
                vst1q_s16(dst + j + 8, v_dst1);
            }
            for (; j < roiw8; j += 8)
            {
                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
                int16x8_t v_src1 = vld1q_s16(src1 + j);
                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
                vst1q_s16(dst + j, v_dst);
            }

            // Scalar tail: truncating cast matches the vector wrap behavior.
            for (; j < size.width; j++)
                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
|
||||
|
||||
/**
 * s16 - u8 -> s16 subtraction over a 2D region.
 * Mirror of the u8 - s16 overload with operands swapped: the u8 operand is
 * zero-extended to s16, then subtracted with saturation or wrap-around
 * per the requested policy.
 */
void sub(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 *dstBase, ptrdiff_t dstStride,
         CONVERT_POLICY policy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Largest widths at which the 16- and 8-element loops may still start.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (policy == CONVERT_POLICY_SATURATE)
        {
            // Saturating path: vqsubq_s16 clamps to [-32768, 32767].
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
                uint8x16_t v_src1 = vld1q_u8(src1 + j);
                // Zero-extend u8 halves to u16, reinterpret as s16
                // (values stay in [0, 255], so the reinterpret is exact).
                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
                vst1q_s16(dst + j, v_dst0);
                vst1q_s16(dst + j + 8, v_dst1);
            }
            for (; j < roiw8; j += 8)
            {
                int16x8_t v_src0 = vld1q_s16(src0 + j);
                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
                vst1q_s16(dst + j, v_dst);
            }

            // Scalar tail: widen to s32 then clamp.
            for (; j < size.width; j++)
                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
        }
        else
        {
            // Wrapping path: vsubq_s16 truncates modulo 2^16.
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
                uint8x16_t v_src1 = vld1q_u8(src1 + j);
                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
                vst1q_s16(dst + j, v_dst0);
                vst1q_s16(dst + j + 8, v_dst1);
            }
            for (; j < roiw8; j += 8)
            {
                int16x8_t v_src0 = vld1q_s16(src0 + j);
                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
                vst1q_s16(dst + j, v_dst);
            }

            // Scalar tail: truncating cast matches the vector wrap behavior.
            for (; j < size.width; j++)
                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)policy;
#endif
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const s8 * src0Base, ptrdiff_t src0Stride,
|
||||
const s8 * src1Base, ptrdiff_t src1Stride,
|
||||
s8 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<s8, s16>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<s8, s16>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const s16 * src0Base, ptrdiff_t src0Stride,
|
||||
const s16 * src1Base, ptrdiff_t src1Stride,
|
||||
s16 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<s16, s32>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<s16, s32>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const u16 * src0Base, ptrdiff_t src0Stride,
|
||||
const u16 * src1Base, ptrdiff_t src1Stride,
|
||||
u16 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<u16, s32>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<u16, s32>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const s32 * src0Base, ptrdiff_t src0Stride,
|
||||
const s32 * src1Base, ptrdiff_t src1Stride,
|
||||
s32 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<s32, s64>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<s32, s64>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const u32 * src0Base, ptrdiff_t src0Stride,
|
||||
const u32 * src1Base, ptrdiff_t src1Stride,
|
||||
u32 *dstBase, ptrdiff_t dstStride,
|
||||
CONVERT_POLICY policy)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
if (policy == CONVERT_POLICY_SATURATE)
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubSaturate<u32, s64>());
|
||||
}
|
||||
else
|
||||
{
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<u32, s64>());
|
||||
}
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)policy;
|
||||
#endif
|
||||
}
|
||||
|
||||
void sub(const Size2D &size,
|
||||
const f32 * src0Base, ptrdiff_t src0Stride,
|
||||
const f32 * src1Base, ptrdiff_t src1Stride,
|
||||
f32 *dstBase, ptrdiff_t dstStride)
|
||||
{
|
||||
internal::assertSupportedConfiguration();
|
||||
#ifdef CAROTENE_NEON
|
||||
internal::vtransform(size,
|
||||
src0Base, src0Stride,
|
||||
src1Base, src1Stride,
|
||||
dstBase, dstStride,
|
||||
SubWrap<f32, f32>());
|
||||
#else
|
||||
(void)size;
|
||||
(void)src0Base;
|
||||
(void)src0Stride;
|
||||
(void)src1Base;
|
||||
(void)src1Stride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
385
3rdparty/carotene/src/sum.cpp
vendored
Normal file
385
3rdparty/carotene/src/sum.cpp
vendored
Normal file
@ -0,0 +1,385 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "vtransform.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
// sum() is implemented for interleaved images with 1 to 4 channels.
bool isSumSupported(u32 channels)
{
    return channels >= 1u && channels <= 4u;
}
|
||||
|
||||
/**
 * Per-channel sum of a u8 image, accumulated into sumdst (u32, one slot per
 * channel, added to the caller's existing values).
 *
 * Accumulation scheme: 8 pixels are summed into u16 lanes for at most 257
 * loads per block (257 * 255 = 65535, exactly the u16 maximum, so the u16
 * accumulator cannot overflow), then the block total is folded into u32
 * lanes with saturating adds.
 */
void sum(const Size2D &_size,
         const u8 * srcBase, ptrdiff_t srcStride,
         u32 * sumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse contiguous rows into a single long row.
    // NOTE(review): the condition compares the stride against size.width
    // only, while sqsum() below uses size.width*channels — for multi-channel
    // images this looks like it never triggers; verify intent upstream.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    const ptrdiff_t width = size.width * channels;

    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr( srcBase, srcStride, k);
        ptrdiff_t i = 0;

        if (channels == 3)
        {
            // Three u32x4 accumulators whose lanes cycle through the
            // channel phases (names list the channel held by each lane);
            // they are untangled below via the sum[12] spill.
            uint32x4_t vs1231 = vdupq_n_u32(0);
            uint32x4_t vs3123 = vdupq_n_u32(0);
            uint32x4_t vs2312 = vdupq_n_u32(0);
            // Full blocks of 257 iterations x 24 pixels.
            for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                // 256 more widening adds: 257 loads total per u16 lane.
                for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
                {
                    internal::prefetch(src + j + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + j + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + j + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
                }

                // Fold the u16 block sums into the u32 accumulators.
                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }
            // Partial block: fewer than 257 groups of 24 remain, so the u16
            // accumulators again cannot overflow.
            if (i <= width - 8*3)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
                uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));

                for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
                {
                    internal::prefetch(src + 24);
                    s1 = vaddw_u8(s1, vld1_u8(src + 0));
                    s2 = vaddw_u8(s2, vld1_u8(src + 8));
                    s3 = vaddw_u8(s3, vld1_u8(src + 16));
                }

                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
            }

            // Spill the phase-rotated accumulators; every 3rd element of
            // sum[] belongs to the same channel.
            u32 sum[12];
            vst1q_u32(sum+0, vs1231);
            vst1q_u32(sum+4, vs2312);
            vst1q_u32(sum+8, vs3123);

            // Scalar tail, one pixel triple at a time.
            for (; i < width; i += 3, src += 3)
            {
                sumdst[0] += src[0];
                sumdst[1] += src[1];
                sumdst[2] += src[2];
            }

            sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
            sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
            sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
        }
        else
        {
            // 1, 2 or 4 channels: lane c of vs accumulates channel c % channels.
            uint32x4_t vs = vdupq_n_u32(0);
            // Full blocks of 257 loads of 8 pixels (u16 lanes stay <= 65535).
            for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));

                for (int j = 8; j < 257 * 8; j += 8)
                {
                    internal::prefetch(src + j);
                    s1 = vaddw_u8(s1, vld1_u8(src + j));
                }

                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }
            // Partial block of fewer than 257 groups of 8.
            if (i < width - 7)
            {
                uint16x8_t s1 = vmovl_u8(vld1_u8(src));

                for(i+=8,src+=8; i < width-7; i+=8,src+=8)
                {
                    internal::prefetch(src);
                    s1 = vaddw_u8(s1, vld1_u8(src));
                }
                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
            }

            if (channels == 1)
            {
                // Horizontal reduction of all four lanes into one scalar.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));

                u32 s0 = vget_lane_u32(vs1, 0);
                for(; i < width; ++i,++src)
                    s0 += src[0];
                sumdst[0] += s0;
            }
            else if (channels == 4)
            {
                // Lanes already line up with the 4 channels.
                vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));

                for(; i < width; i+=4,src+=4)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                    sumdst[2] += src[2];
                    sumdst[3] += src[3];
                }
            }
            else//if (channels == 2)
            {
                // Fold lanes (0,2) and (1,3) to get per-channel pairs.
                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
                vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));

                for(; i < width; i+=2,src+=2)
                {
                    sumdst[0] += src[0];
                    sumdst[1] += src[1];
                }
            }
        }//channels != 3
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)channels;
#endif
}
|
||||
|
||||
/**
 * Per-channel sum of an f32 image, accumulated into sumdst (f64, one slot
 * per channel, added to the caller's existing values).
 *
 * Note: vector partial sums are accumulated in f32 and only the final
 * per-channel totals are added into the f64 destinations, so very large
 * images may lose precision in the f32 accumulators.
 */
void sum(const Size2D &_size,
         const f32 * srcBase, ptrdiff_t srcStride,
         f64 * sumdst, u32 channels)
{
    internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Collapse contiguous rows into a single long row.
    // NOTE(review): stride is compared against size.width, not the row size
    // in bytes/elements times channels — for f32 data this looks like it
    // rarely (if ever) triggers; verify intent upstream.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    const ptrdiff_t width = size.width * channels;

    for(size_t k = 0; k < size.height; ++k)
    {
        const f32* src = internal::getRowPtr( srcBase, srcStride, k);
        ptrdiff_t i = 0;

        if (channels == 3)
        {
            // Three f32x4 accumulators whose lanes cycle through channel
            // phases (names list the channel held by each lane); they are
            // untangled below via the s[12] spill.
            float32x4_t vs1231 = vdupq_n_f32(0);
            float32x4_t vs2312 = vdupq_n_f32(0);
            float32x4_t vs3123 = vdupq_n_f32(0);
            // 12 floats (4 pixel triples) per iteration.
            for(; i <= width-12; i += 12)
            {
                internal::prefetch(src + i + 12);
                vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
                vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
                vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
            }

            // Spill; every 3rd element of s[] belongs to the same channel.
            f32 s[12];
            vst1q_f32(s + 0, vs1231);
            vst1q_f32(s + 4, vs2312);
            vst1q_f32(s + 8, vs3123);

            sumdst[0] += s[0] + s[3] + s[6] + s[9];
            sumdst[1] += s[1] + s[4] + s[7] + s[10];
            sumdst[2] += s[2] + s[5] + s[8] + s[11];
            // Scalar tail, one pixel triple at a time.
            for( ; i < width; i+=3)
            {
                sumdst[0] += src[i];
                sumdst[1] += src[i+1];
                sumdst[2] += src[i+2];
            }
        }
        else
        {
            // 1, 2 or 4 channels: lane c accumulates channel c % channels.
            float32x4_t vs = vdupq_n_f32(0);
            for(; i <= width-4; i += 4)
            {
                internal::prefetch(src + i);
                vs = vaddq_f32(vs, vld1q_f32(src+i));
            }

            if (channels == 1)
            {
                // Horizontal reduction of all four lanes into one scalar.
                float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
                f32 s[2];
                vst1_f32(s, vs2);

                sumdst[0] += s[0] + s[1];
                for( ; i < width; i++)
                    sumdst[0] += src[i];
            }
            else if (channels == 4)
            {
                // Lanes already line up with the 4 channels; width is a
                // multiple of 4 elements, so no scalar tail is needed.
                f32 s[4];
                vst1q_f32(s, vs);

                sumdst[0] += s[0];
                sumdst[1] += s[1];
                sumdst[2] += s[2];
                sumdst[3] += s[3];
            }
            else//if (channels == 2)
            {
                // Fold lanes (0,2) and (1,3) to get per-channel pairs.
                float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
                f32 s[2];
                vst1_f32(s, vs2);

                sumdst[0] += s[0];
                sumdst[1] += s[1];

                // At most one pixel pair can remain after the 4-wide loop.
                if(i < width)
                {
                    sumdst[0] += src[i];
                    sumdst[1] += src[i+1];
                }
            }
        }//channels != 3
    }
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;
    (void)sumdst;
    (void)channels;
#endif
}
|
||||
|
||||
// sqsum() supports channel counts that divide 4 evenly: 1, 2 or 4.
bool isSqsumSupported(u32 channels)
{
    return channels == 1u || channels == 2u || channels == 4u;
}
|
||||
|
||||
void sqsum(const Size2D &_size,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
f64 * sumdst, f64 * sqsumdst, u32 channels)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isSqsumSupported(channels));
|
||||
#ifdef CAROTENE_NEON
|
||||
Size2D size(_size);
|
||||
if (srcStride == (ptrdiff_t)(size.width*channels))
|
||||
{
|
||||
size.width *= size.height;
|
||||
size.height = 1;
|
||||
}
|
||||
const size_t width = size.width * channels;
|
||||
|
||||
size_t blockSize0 = 1 << 23;
|
||||
size_t roiw8 = width & ~7;
|
||||
|
||||
uint32x4_t v_zero = vdupq_n_u32(0u);
|
||||
|
||||
for (size_t i = 0; i < size.height; ++i)
|
||||
{
|
||||
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
|
||||
size_t j = 0u;
|
||||
|
||||
while (j < roiw8)
|
||||
{
|
||||
size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
|
||||
uint32x4_t v_sum = v_zero;
|
||||
uint32x4_t v_sqsum = v_zero;
|
||||
|
||||
for ( ; j < blockSize ; j += 8, src += 8)
|
||||
{
|
||||
internal::prefetch(src);
|
||||
uint8x8_t v_src0 = vld1_u8(src);
|
||||
|
||||
uint16x8_t v_src = vmovl_u8(v_src0);
|
||||
uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
|
||||
v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
|
||||
v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
|
||||
v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
|
||||
}
|
||||
|
||||
u32 arsum[8];
|
||||
vst1q_u32(arsum, v_sum);
|
||||
vst1q_u32(arsum + 4, v_sqsum);
|
||||
|
||||
sumdst[0] += (f64)arsum[0];
|
||||
sumdst[1 % channels] += (f64)arsum[1];
|
||||
sumdst[2 % channels] += (f64)arsum[2];
|
||||
sumdst[3 % channels] += (f64)arsum[3];
|
||||
sqsumdst[0] += (f64)arsum[4];
|
||||
sqsumdst[1 % channels] += (f64)arsum[5];
|
||||
sqsumdst[2 % channels] += (f64)arsum[6];
|
||||
sqsumdst[3 % channels] += (f64)arsum[7];
|
||||
}
|
||||
// collect a few last elements in the current row
|
||||
// it's ok to process channels elements per step
|
||||
// since we could handle 1,2 or 4 channels
|
||||
// we always have channels-fold amount of elements remaining
|
||||
for ( ; j < width; j+=channels, src+=channels)
|
||||
{
|
||||
for (u32 kk = 0; kk < channels; kk++)
|
||||
{
|
||||
u32 srcval = src[kk];
|
||||
sumdst[kk] += srcval;
|
||||
sqsumdst[kk] += srcval * srcval;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)_size;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)sumdst;
|
||||
(void)sqsumdst;
|
||||
(void)channels;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
241
3rdparty/carotene/src/template_matching.cpp
vendored
Normal file
241
3rdparty/carotene/src/template_matching.cpp
vendored
Normal file
@ -0,0 +1,241 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than
|
||||
//time for simultaneous 2 lines matching for the same amount of data
|
||||
|
||||
bool isMatchTemplateSupported(const Size2D &tmplSize)
|
||||
{
|
||||
return isSupportedConfiguration() &&
|
||||
tmplSize.width >= 8 && // Actually the function could process even shorter templates
|
||||
// but there will be no NEON optimization in this case
|
||||
(tmplSize.width * tmplSize.height) <= 256;
|
||||
}
|
||||
|
||||
void matchTemplate(const Size2D &srcSize,
|
||||
const u8 * srcBase, ptrdiff_t srcStride,
|
||||
const Size2D &tmplSize,
|
||||
const u8 * tmplBase, ptrdiff_t tmplStride,
|
||||
f32 * dstBase, ptrdiff_t dstStride,
|
||||
bool normalize)
|
||||
{
|
||||
internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
|
||||
#ifdef CAROTENE_NEON
|
||||
const size_t tmplW = tmplSize.width;
|
||||
const size_t tmplH = tmplSize.height;
|
||||
const size_t dstW = srcSize.width - tmplSize.width + 1;
|
||||
const size_t dstH = srcSize.height - tmplSize.height + 1;
|
||||
|
||||
//template correlation part
|
||||
{
|
||||
#if ENABLE4LINESMATCHING
|
||||
const size_t dstroiw4 = dstW & ~3u;
|
||||
#endif
|
||||
const size_t dstroiw2 = dstW & ~1u;
|
||||
const size_t tmplroiw = tmplW & ~7u;
|
||||
const size_t dstride = dstStride >> 2;
|
||||
|
||||
f32 *corr = dstBase;
|
||||
const u8 *imgrrow = srcBase;
|
||||
for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
|
||||
{
|
||||
size_t c = 0;
|
||||
#if ENABLE4LINESMATCHING
|
||||
for(; c < dstroiw4; c+=4)
|
||||
{
|
||||
u32 dot[4] = {0, 0, 0, 0};
|
||||
uint32x4_t vdot0 = vmovq_n_u32(0);
|
||||
uint32x4_t vdot1 = vmovq_n_u32(0);
|
||||
uint32x4_t vdot2 = vmovq_n_u32(0);
|
||||
uint32x4_t vdot3 = vmovq_n_u32(0);
|
||||
|
||||
const u8 *img = imgrrow;
|
||||
const u8 *tmpl = tmplBase;
|
||||
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
|
||||
{
|
||||
size_t j = 0;
|
||||
for(; j < tmplroiw; j+=8)
|
||||
{
|
||||
uint8x8_t vtmpl = vld1_u8(tmpl + j);
|
||||
|
||||
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
|
||||
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
|
||||
uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
|
||||
uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
|
||||
|
||||
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
|
||||
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
|
||||
uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
|
||||
uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
|
||||
|
||||
vdot0 = vpadalq_u16(vdot0, vd0);
|
||||
vdot1 = vpadalq_u16(vdot1, vd1);
|
||||
vdot2 = vpadalq_u16(vdot2, vd2);
|
||||
vdot3 = vpadalq_u16(vdot3, vd3);
|
||||
}
|
||||
for(; j < tmplW; ++j)
|
||||
{
|
||||
dot[0] += tmpl[j] * img[j + c + 0];
|
||||
dot[1] += tmpl[j] * img[j + c + 1];
|
||||
dot[2] += tmpl[j] * img[j + c + 2];
|
||||
dot[3] += tmpl[j] * img[j + c + 3];
|
||||
}
|
||||
}
|
||||
uint32x4_t vdotx = vld1q_u32(dot);
|
||||
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
|
||||
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
|
||||
uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
|
||||
uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
|
||||
uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
|
||||
uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
|
||||
|
||||
vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
|
||||
}
|
||||
#endif
|
||||
|
||||
for(; c < dstroiw2; c+=2)
|
||||
{
|
||||
u32 dot[2] = {0, 0};
|
||||
uint32x4_t vdot0 = vmovq_n_u32(0);
|
||||
uint32x4_t vdot1 = vmovq_n_u32(0);
|
||||
const u8 *img = imgrrow;
|
||||
const u8 *tmpl = tmplBase;
|
||||
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
|
||||
{
|
||||
size_t j = 0;
|
||||
for(; j < tmplroiw; j+=8)
|
||||
{
|
||||
uint8x8_t vtmpl = vld1_u8(tmpl + j);
|
||||
|
||||
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
|
||||
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
|
||||
|
||||
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
|
||||
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
|
||||
|
||||
vdot0 = vpadalq_u16(vdot0, vd0);
|
||||
vdot1 = vpadalq_u16(vdot1, vd1);
|
||||
}
|
||||
for(; j < tmplW; ++j)
|
||||
{
|
||||
dot[0] += tmpl[j] * img[j + c + 0];
|
||||
dot[1] += tmpl[j] * img[j + c + 1];
|
||||
}
|
||||
}
|
||||
uint32x2_t vdotx = vld1_u32(dot);
|
||||
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
|
||||
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
|
||||
uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
|
||||
vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
|
||||
}
|
||||
|
||||
for(; c < dstW; ++c)
|
||||
{
|
||||
u32 dot = 0;
|
||||
uint32x4_t vdot = vmovq_n_u32(0);
|
||||
const u8 *img = imgrrow;
|
||||
const u8 *tmpl = tmplBase;
|
||||
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
|
||||
{
|
||||
size_t j = 0;
|
||||
for(; j < tmplroiw; j+=8)
|
||||
{
|
||||
uint8x8_t vtmpl = vld1_u8(tmpl + j);
|
||||
uint8x8_t vimg = vld1_u8(img + j + c);
|
||||
uint16x8_t vd = vmull_u8(vtmpl, vimg);
|
||||
vdot = vpadalq_u16(vdot, vd);
|
||||
}
|
||||
for(; j < tmplW; ++j)
|
||||
dot += tmpl[j] * img[j + c];
|
||||
}
|
||||
u32 wdot[2];
|
||||
vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
|
||||
dot += wdot[0] + wdot[1];
|
||||
corr[c] = (f32)dot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(normalize)
|
||||
{
|
||||
f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
|
||||
|
||||
size_t iw = srcSize.width+1;
|
||||
size_t ih = srcSize.height+1;
|
||||
std::vector<f64> _sqsum(iw*ih);
|
||||
f64 *sqsum = &_sqsum[0];
|
||||
memset(sqsum, 0, iw*sizeof(f64));
|
||||
for(size_t i = 1; i < ih; ++i)
|
||||
sqsum[iw*i] = 0.;
|
||||
sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
|
||||
|
||||
for(size_t i = 0; i < dstH; ++i)
|
||||
{
|
||||
f32 *result = internal::getRowPtr(dstBase, dstStride, i);
|
||||
for(size_t j = 0; j < dstW; ++j)
|
||||
{
|
||||
double s2 = sqsum[iw*i + j] +
|
||||
sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
|
||||
sqsum[iw*(i + tmplSize.height) + j] -
|
||||
sqsum[iw*i + j + tmplSize.width];
|
||||
|
||||
result[j] /= tn * std::sqrt(s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)srcSize;
|
||||
(void)srcBase;
|
||||
(void)srcStride;
|
||||
(void)tmplBase;
|
||||
(void)tmplStride;
|
||||
(void)dstBase;
|
||||
(void)dstStride;
|
||||
(void)normalize;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
1627
3rdparty/carotene/src/threshold.cpp
vendored
Normal file
1627
3rdparty/carotene/src/threshold.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
689
3rdparty/carotene/src/vtransform.hpp
vendored
Normal file
689
3rdparty/carotene/src/vtransform.hpp
vendored
Normal file
@ -0,0 +1,689 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#ifndef CAROTENE_SRC_VTRANSFORM_HPP
|
||||
#define CAROTENE_SRC_VTRANSFORM_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include <carotene/types.hpp>
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
namespace CAROTENE_NS { namespace internal {
|
||||
|
||||
////////////////////////////// Type Traits ///////////////////////
|
||||
|
||||
template <typename T, int cn = 1>
|
||||
struct VecTraits;
|
||||
|
||||
template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
|
||||
template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
|
||||
template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
|
||||
template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
|
||||
template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
|
||||
template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
|
||||
template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
|
||||
template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
|
||||
template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
|
||||
|
||||
template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
|
||||
template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
|
||||
template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
|
||||
template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
|
||||
template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
|
||||
template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
|
||||
template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
|
||||
|
||||
template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
|
||||
template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
|
||||
template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
|
||||
template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
|
||||
template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
|
||||
template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
|
||||
template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
|
||||
template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
|
||||
template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
|
||||
template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
|
||||
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
|
||||
|
||||
////////////////////////////// vld1q ///////////////////////
|
||||
|
||||
inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
|
||||
inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
|
||||
inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
|
||||
inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
|
||||
inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
|
||||
inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
|
||||
inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld1 ///////////////////////
|
||||
|
||||
inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
|
||||
inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
|
||||
inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
|
||||
inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
|
||||
inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
|
||||
inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
|
||||
inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld2q ///////////////////////
|
||||
|
||||
inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
|
||||
inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
|
||||
inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
|
||||
inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
|
||||
inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
|
||||
inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
|
||||
inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld2 ///////////////////////
|
||||
|
||||
inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
|
||||
inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
|
||||
inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
|
||||
inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
|
||||
inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
|
||||
inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
|
||||
inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld3q ///////////////////////
|
||||
|
||||
inline uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); }
|
||||
inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); }
|
||||
inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); }
|
||||
inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); }
|
||||
inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); }
|
||||
inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); }
|
||||
inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld3 ///////////////////////
|
||||
|
||||
inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); }
|
||||
inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); }
|
||||
inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); }
|
||||
inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); }
|
||||
inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); }
|
||||
inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); }
|
||||
inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld4q ///////////////////////
|
||||
|
||||
inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); }
|
||||
inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); }
|
||||
inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); }
|
||||
inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); }
|
||||
inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); }
|
||||
inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); }
|
||||
inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); }
|
||||
|
||||
////////////////////////////// vld4 ///////////////////////
|
||||
|
||||
inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); }
|
||||
inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); }
|
||||
inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); }
|
||||
inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); }
|
||||
inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); }
|
||||
inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); }
|
||||
inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); }
|
||||
|
||||
////////////////////////////// vst1q ///////////////////////
|
||||
|
||||
inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); }
|
||||
inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); }
|
||||
inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); }
|
||||
inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); }
|
||||
inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); }
|
||||
inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); }
|
||||
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst1 ///////////////////////
|
||||
|
||||
inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); }
|
||||
inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); }
|
||||
inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); }
|
||||
inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); }
|
||||
inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); }
|
||||
inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); }
|
||||
inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst2q ///////////////////////
|
||||
|
||||
inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); }
|
||||
inline void vst2q(s8 * ptr, const int8x16x2_t & v) { return vst2q_s8(ptr, v); }
|
||||
inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); }
|
||||
inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); }
|
||||
inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); }
|
||||
inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); }
|
||||
inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst2 ///////////////////////
|
||||
|
||||
inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); }
|
||||
inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); }
|
||||
inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); }
|
||||
inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); }
|
||||
inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); }
|
||||
inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); }
|
||||
inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst3q ///////////////////////
|
||||
|
||||
inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); }
|
||||
inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); }
|
||||
inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); }
|
||||
inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); }
|
||||
inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); }
|
||||
inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); }
|
||||
inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst3 ///////////////////////
|
||||
|
||||
inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); }
|
||||
inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); }
|
||||
inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); }
|
||||
inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); }
|
||||
inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); }
|
||||
inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); }
|
||||
inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst4q ///////////////////////
|
||||
|
||||
inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); }
|
||||
inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); }
|
||||
inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); }
|
||||
inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); }
|
||||
inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); }
|
||||
inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); }
|
||||
inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vst4 ///////////////////////
|
||||
|
||||
inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); }
|
||||
inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); }
|
||||
inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); }
|
||||
inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); }
|
||||
inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); }
|
||||
inline void vst4(s32 * ptr, const int32x2x4_t & v) { return vst4_s32(ptr, v); }
|
||||
inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); }
|
||||
|
||||
////////////////////////////// vabdq ///////////////////////
|
||||
|
||||
inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); }
|
||||
inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); }
|
||||
inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); }
|
||||
inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); }
|
||||
inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); }
|
||||
inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); }
|
||||
inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vabd ///////////////////////
|
||||
|
||||
inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); }
|
||||
inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); }
|
||||
inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); }
|
||||
inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); }
|
||||
inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); }
|
||||
inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); }
|
||||
inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vminq ///////////////////////

// Element-wise minimum, 128-bit vectors: one overload per lane type,
// forwarding to the matching NEON intrinsic.
inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); }
inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); }
inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); }
inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); }
inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); }
inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); }
inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }

////////////////////////////// vmin ///////////////////////

// Element-wise minimum, 64-bit vectors.
inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); }
inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); }
inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); }
inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); }
inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); }
inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); }
inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vmaxq ///////////////////////

// Element-wise maximum, 128-bit vectors: one overload per lane type,
// forwarding to the matching NEON intrinsic.
inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); }
inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); }
inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); }
inline int16x8_t vmaxq(const int16x8_t & v0, const int16x8_t & v1) { return vmaxq_s16(v0, v1); }
inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); }
inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); }
inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); }

////////////////////////////// vmax ///////////////////////

// Element-wise maximum, 64-bit vectors.
inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); }
inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); }
inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); }
inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); }
inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); }
inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); }
inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vdupq_n ///////////////////////

// Broadcast a scalar into all lanes of a 128-bit vector; the overload set
// selects the intrinsic from the scalar's type (u8/s8 ... u64/s64, f32).
inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); }
inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); }
inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); }
inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); }
inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); }
inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); }
inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); }
inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); }
inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); }

////////////////////////////// vdup_n ///////////////////////

// Broadcast a scalar into all lanes of a 64-bit vector.
inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); }
inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); }
inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); }
inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); }
inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); }
inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); }
inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); }
inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); }
inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); }
|
||||
|
||||
////////////////////////////// vget_low ///////////////////////

// Extract the low 64-bit half of a 128-bit vector, dispatched on lane type.
inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); }
inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); }
inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); }
inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); }
inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); }
inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); }
inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); }

////////////////////////////// vget_high ///////////////////////

// Extract the high 64-bit half of a 128-bit vector, dispatched on lane type.
inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); }
inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); }
inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); }
inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); }
inline uint32x2_t vget_high(const uint32x4_t & v) { return vget_high_u32(v); }
inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); }
inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); }
|
||||
|
||||
////////////////////////////// vcombine ///////////////////////

// Join two 64-bit vectors into one 128-bit vector (v0 becomes the low half,
// v1 the high half), dispatched on lane type.
inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); }
inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); }
inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); }
inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); }
inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); }
inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); }
inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vaddq ///////////////////////

// Element-wise (wrapping) addition, 128-bit vectors, dispatched on lane type.
inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); }
inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); }
inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); }
inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); }
inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); }
inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); }
inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); }

////////////////////////////// vadd ///////////////////////

// Element-wise (wrapping) addition, 64-bit vectors.
inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); }
inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); }
inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); }
inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); }
inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); }
inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); }
inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vqaddq ///////////////////////

// Saturating addition, 128-bit vectors (integer lane types only —
// NEON has no saturating float add).
inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); }
inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); }
inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); }
inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); }
inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); }
inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); }

////////////////////////////// vqadd ///////////////////////

// Saturating addition, 64-bit vectors (integer lane types only).
inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); }
inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); }
inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); }
inline int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); }
inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); }
inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); }
|
||||
|
||||
////////////////////////////// vsubq ///////////////////////

// Element-wise (wrapping) subtraction, 128-bit vectors, dispatched on lane type.
inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); }
inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); }
inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); }
inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); }
inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); }
inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); }
inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); }

////////////////////////////// vsub ///////////////////////

// Element-wise (wrapping) subtraction, 64-bit vectors.
inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); }
inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); }
inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); }
inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); }
inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); }
inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); }
inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vqsubq ///////////////////////

// Saturating subtraction, 128-bit vectors; unlike vqaddq above, 64-bit
// lane overloads are also provided.
inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); }
inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); }
inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); }
inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); }
inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); }
inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); }
inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); }
inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); }

////////////////////////////// vqsub ///////////////////////

// Saturating subtraction, 64-bit vectors.
inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); }
inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); }
inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); }
inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); }
inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); }
inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); }
inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); }
inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); }
|
||||
|
||||
////////////////////////////// vmull ///////////////////////

// Widening multiply: multiplies two 64-bit vectors and produces a 128-bit
// vector whose lanes are twice as wide, so the full product is kept.
inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); }
inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); }
inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); }
inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); }
inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); }
inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); }
|
||||
|
||||
////////////////////////////// vrev64q ///////////////////////

// Reverse the element order within each 64-bit chunk of a 128-bit vector,
// dispatched on lane type.
inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); }
inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); }
inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); }
inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); }
inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); }
inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); }
inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); }

////////////////////////////// vrev64 ///////////////////////

// Reverse the element order of a 64-bit vector.
inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); }
inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); }
inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); }
inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); }
inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); }
inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); }
inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); }
|
||||
|
||||
////////////////////////////// vceqq ///////////////////////

// Lane-wise equality compare, 128-bit vectors. Note the return type is
// always the unsigned mask type of the same lane width (all-ones / all-zeros).
inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); }
inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); }
inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); }
inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); }
inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); }
inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); }
inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }

////////////////////////////// vceq ///////////////////////

// Lane-wise equality compare, 64-bit vectors; returns an unsigned mask.
inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); }
inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); }
inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); }
inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); }
inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); }
inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); }
inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vcgtq ///////////////////////

// Lane-wise "greater than" compare (v0 > v1), 128-bit vectors;
// returns the unsigned mask type of the same lane width.
inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); }
inline uint8x16_t vcgtq(const int8x16_t & v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); }
inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); }
inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); }
inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); }
inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); }
inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); }

////////////////////////////// vcgt ///////////////////////

// Lane-wise "greater than" compare (v0 > v1), 64-bit vectors.
inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); }
inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); }
inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); }
inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); }
inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); }
inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); }
inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vcgeq ///////////////////////

// Lane-wise "greater than or equal" compare (v0 >= v1), 128-bit vectors;
// returns the unsigned mask type of the same lane width.
inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); }
inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); }
inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); }
inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); }
inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); }
inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); }
inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); }

////////////////////////////// vcge ///////////////////////

// Lane-wise "greater than or equal" compare (v0 >= v1), 64-bit vectors.
inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); }
inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); }
inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); }
inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); }
inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); }
inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); }
inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); }
|
||||
|
||||
////////////////////////////// vandq ///////////////////////

// Bitwise AND, 128-bit vectors (integer lane types only).
inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); }
inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); }
inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); }
inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); }
inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); }
inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); }

////////////////////////////// vand ///////////////////////

// Bitwise AND, 64-bit vectors (integer lane types only).
inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); }
inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); }
inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); }
inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); }
inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); }
inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); }
|
||||
|
||||
////////////////////////////// vmovn ///////////////////////

// Narrowing move: truncates each lane of a 128-bit vector to half its width,
// producing a 64-bit vector (high bits are discarded).
inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); }
inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); }
inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); }
inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); }
inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); }
inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); }

////////////////////////////// vqmovn ///////////////////////

// Saturating narrowing move: like vmovn but values outside the narrower
// type's range saturate instead of being truncated.
inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); }
inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); }
inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); }
inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); }
inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); }
inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); }
|
||||
|
||||
////////////////////////////// vmovl ///////////////////////

// Widening move: extends each lane of a 64-bit vector to twice its width
// (zero-extend for unsigned, sign-extend for signed), giving a 128-bit vector.
inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); }
inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); }
inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); }
inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); }
|
||||
|
||||
////////////////////////////// vmvnq ///////////////////////

// Bitwise NOT, 128-bit vectors (integer lane types only).
inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); }
inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); }
inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); }
inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); }
inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); }
inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); }

////////////////////////////// vmvn ///////////////////////

// Bitwise NOT, 64-bit vectors (integer lane types only).
inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); }
inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); }
inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); }
inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); }
inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); }
inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); }
|
||||
|
||||
////////////////////////////// vbicq ///////////////////////

// Bit clear (v0 AND NOT v1), 128-bit vectors, dispatched on lane type.
inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); }
inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); }
inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); }
inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); }
inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); }
inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); }
inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
inline int64x2_t vbicq(const int64x2_t & v0, const int64x2_t & v1) { return vbicq_s64(v0, v1); }

////////////////////////////// vbic ///////////////////////

// Bit clear (v0 AND NOT v1), 64-bit vectors.
inline uint8x8_t vbic(const uint8x8_t & v0, const uint8x8_t & v1) { return vbic_u8 (v0, v1); }
inline int8x8_t vbic(const int8x8_t & v0, const int8x8_t & v1) { return vbic_s8 (v0, v1); }
inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
inline int16x4_t vbic(const int16x4_t & v0, const int16x4_t & v1) { return vbic_s16(v0, v1); }
inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
inline int32x2_t vbic(const int32x2_t & v0, const int32x2_t & v1) { return vbic_s32(v0, v1); }
inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
inline int64x1_t vbic(const int64x1_t & v0, const int64x1_t & v1) { return vbic_s64(v0, v1); }
|
||||
|
||||
////////////////////////////// vtransform ///////////////////////

// Applies a binary element-wise operation Op to two 2D arrays of equal size,
// writing the result into dst. Op supplies the element type (Op::type) and
// three call forms: (vec128, vec128, vec128&), (vec64, vec64, vec64&) and
// (const type*, const type*, type*) for the vectorized and scalar paths.
//
// Parameters:
//   size                - ROI dimensions in elements.
//   src0Base/src0Stride - first source array base pointer and row stride (bytes).
//   src1Base/src1Stride - second source array base pointer and row stride (bytes).
//   dstBase/dstStride   - destination array base pointer and row stride (bytes).
//   op                  - the operation functor.
template <typename Op>
void vtransform(Size2D size,
                const typename Op::type * src0Base, ptrdiff_t src0Stride,
                const typename Op::type * src1Base, ptrdiff_t src1Stride,
                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type type;
    typedef typename VecTraits<type>::vec128 vec128;
    typedef typename VecTraits<type>::vec64 vec64;

    // When all three arrays are dense (stride equals a row's byte width),
    // treat the whole ROI as one long row to avoid per-row loop overhead.
    if (src0Stride == src1Stride && src0Stride == dstStride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
    {
        size.width *= size.height;
        size.height = 1;
    }

    // Main loop consumes 32 bytes (two 128-bit vectors) per iteration;
    // tail loop consumes 8 bytes (one 64-bit vector). roiw_* is the first
    // index at which a full step would no longer fit in the row.
    const size_t step_base = 32 / sizeof(type);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
    const size_t step_tail = 8 / sizeof(type);
    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;

    for (size_t y = 0; y < size.height; ++y)
    {
        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
        typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y);
        size_t x = 0;

        // Wide path: two 128-bit vectors from each source per iteration.
        for( ; x < roiw_base; x += step_base )
        {
            internal::prefetch(src0 + x);
            internal::prefetch(src1 + x);

            vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type));
            vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type));
            vec128 v_dst;

            op(v_src00, v_src10, v_dst);
            vst1q(dst + x, v_dst);

            op(v_src01, v_src11, v_dst);
            vst1q(dst + x + 16 / sizeof(type), v_dst);
        }
        // Narrow path: one 64-bit vector per iteration for the remainder.
        for( ; x < roiw_tail; x += step_tail )
        {
            vec64 v_src0 = vld1(src0 + x);
            vec64 v_src1 = vld1(src1 + x);
            vec64 v_dst;

            op(v_src0, v_src1, v_dst);
            vst1(dst + x, v_dst);
        }

        // Scalar path: leftover elements that do not fill a 64-bit vector.
        for (; x < size.width; ++x)
        {
            op(src0 + x, src1 + x, dst + x);
        }
    }
}
|
||||
|
||||
} }
|
||||
|
||||
#endif // CAROTENE_NEON
|
||||
|
||||
#endif
|
434
3rdparty/carotene/src/warp_affine.cpp
vendored
Normal file
434
3rdparty/carotene/src/warp_affine.cpp
vendored
Normal file
@ -0,0 +1,434 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
#include "remap.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool isWarpAffineLinearSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Nearest-neighbor affine warp of a single-channel u8 image.
//
// For each destination pixel (x, y) the source position is
//     src_x = m[0]*x + m[2]*y + m[4]
//     src_y = m[1]*x + m[3]*y + m[5]
// (see the scalar tail below), i.e. m holds the 2x3 affine matrix in
// column-major order.
//
// The image is processed in BLOCK_SIZE x BLOCK_SIZE tiles: for each tile a
// table of source byte offsets (src_y * srcStride + src_x) is precomputed
// into `map`, then the actual pixel transfer is delegated to the remap
// helpers. Only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT are handled;
// any other borderMode leaves the destination untouched (no else branch).
// borderValue is used only by the CONSTANT path.
void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                               const u8 * srcBase, ptrdiff_t srcStride,
                               const f32 * m,
                               u8 * dstBase, ptrdiff_t dstStride,
                               BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile offset table; +16 extra elements so alignPtr can round the
    // base up to a 16-byte boundary for aligned NEON stores.
    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
    s32 * map = alignPtr(_map, 16);

    // Broadcast clamping limits (width-1, height-1), the row stride and the
    // x-advance (4 pixels per vector iteration).
    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride);
    float32x4_t v_4 = vdupq_n_f32(4.0f);

    // Broadcast affine coefficients.
    float32x4_t v_m0 = vdupq_n_f32(m[0]);
    float32x4_t v_m1 = vdupq_n_f32(m[1]);
    float32x4_t v_m2 = vdupq_n_f32(m[2]);
    float32x4_t v_m3 = vdupq_n_f32(m[3]);
    float32x4_t v_m4 = vdupq_n_f32(m[4]);
    float32x4_t v_m5 = vdupq_n_f32(m[5]);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table: one source offset per destination pixel of the tile
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    // Hoist the y-dependent part of the affine transform out of the x loop.
                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);

                        // Clamp to the valid source rectangle (replicate border).
                        // NOTE(review): vcvtq truncates toward zero while the scalar
                        // tail uses floorf; after clamping to [0, size-1] both paths
                        // agree -- confirm if border semantics are ever extended.
                        int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
                        int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
                        int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
                        vst1q_s32(map_row + x, v_src_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) pixels of the row.
                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 src_x_f = m[0] * x_ + yx;
                        f32 src_y_f = m[1] * x_ + yy;
                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);

                        src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
                        src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
                        map_row[x] = src_y * srcStride + src_x;
                    }
                }

                // make remap: gather the source pixels through the offset table
                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                              getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        // Out-of-range pixels are encoded as offset -1; the remap helper
        // substitutes borderValue for them.
        int32x4_t v_m1_4 = vdupq_n_s32(-1);
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);

                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
                        // In-range test: coordinate >= 0 (checked on the float to catch
                        // values in (-1, 0)) and converted index <= size-1.
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
                        int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_src_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail.
                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 src_x_f = m[0] * x_ + yx;
                        f32 src_y_f = m[1] * x_ + yy;
                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);

                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                    }
                }

                // make remap
                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                          getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)m;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
// Bilinear affine warp of a single-channel u8 image.
//
// Source mapping (see the scalar tail):
//     src_x = m[0]*x + m[2]*y + m[4]
//     src_y = m[1]*x + m[3]*y + m[5]
//
// Works in BLOCK_SIZE x BLOCK_SIZE tiles. For each destination pixel it
// precomputes:
//   - map:    4 source byte offsets (top-left, top-right, bottom-left,
//             bottom-right of the 2x2 interpolation footprint), interleaved
//             via vst4q_s32;
//   - coeffs: 2 fractional weights (dx, dy), interleaved via vst2q_f32;
// then delegates the weighted gather to the remapLinear* helpers.
// Only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT are handled; other
// modes leave the destination untouched. borderValue is used only by the
// CONSTANT path (out-of-range taps are encoded as offset -1).
void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      const f32 * m,
                      u8 * dstBase, ptrdiff_t dstStride,
                      BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // 4 offsets and 2 coefficients per pixel; +16 slack for 16-byte alignment.
    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    // Broadcast clamp limits, stride, and constants used by floor correction.
    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

    // Broadcast affine coefficients.
    float32x4_t v_m0 = vdupq_n_f32(m[0]);
    float32x4_t v_m1 = vdupq_n_f32(m[1]);
    float32x4_t v_m2 = vdupq_n_f32(m[2]);
    float32x4_t v_m3 = vdupq_n_f32(m[3]);
    float32x4_t v_m4 = vdupq_n_f32(m[4]);
    float32x4_t v_m5 = vdupq_n_f32(m[5]);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table: offsets + fractional weights for the tile
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
                    // y-dependent part of the transform hoisted out of the x loop.
                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);

                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);

                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
                        // vcvtq truncates toward zero; for negative coordinates the
                        // fraction comes out negative, so shift it into [0,1) and
                        // decrement the integer part to get floor() semantics.
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);

                        // Clamp both footprint corners to the image (replicate border).
                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));

                        // Byte offsets of the 4 taps: TL, TR, BL, BR.
                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) pixels of the row.
                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 src_x_f = m[0] * x_ + yx;
                        f32 src_y_f = m[1] * x_ + yy;

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        // Clamp the +1 corner before clobbering src0_*.
                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                // Weighted gather of the precomputed taps into the destination tile.
                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);

                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);

                        // Floor correction for negative coordinates (see REPLICATE path).
                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);

                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        // Per-tap in-range masks; the +1 taps reuse the float test
                        // shifted by one to catch coordinates in (-1, 0).
                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));

                        // Out-of-range taps become offset -1 -> borderValue in the remap.
                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail.
                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 src_x_f = m[0] * x_ + yx;
                        f32 src_y_f = m[1] * x_ + yy;

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)m;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
464
3rdparty/carotene/src/warp_perspective.cpp
vendored
Normal file
464
3rdparty/carotene/src/warp_perspective.cpp
vendored
Normal file
@ -0,0 +1,464 @@
|
||||
/*
|
||||
* By downloading, copying, installing or using the software you agree to this license.
|
||||
* If you do not agree to this license, do not download, install,
|
||||
* copy or use the software.
|
||||
*
|
||||
*
|
||||
* License Agreement
|
||||
* For Open Source Computer Vision Library
|
||||
* (3-clause BSD License)
|
||||
*
|
||||
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * Neither the names of the copyright holders nor the names of the contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall copyright holders or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "remap.hpp"
|
||||
|
||||
namespace CAROTENE_NS {
|
||||
|
||||
bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
|
||||
{
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
|
||||
// is performed with u32
|
||||
isSupportedConfiguration();
|
||||
#else
|
||||
(void)ssize;
|
||||
return isSupportedConfiguration();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Nearest-neighbor perspective warp of a single-channel u8 image.
//
// For each destination pixel (x, y) the source position is
//     w     = m[2]*x + m[5]*y + m[8]
//     src_x = (m[0]*x + m[3]*y + m[6]) / w
//     src_y = (m[1]*x + m[4]*y + m[7]) / w
// (see the scalar tail), i.e. m holds the 3x3 homography in column-major
// order.
//
// Same tile scheme as warpAffineNearestNeighbor: per BLOCK_SIZE tile a table
// of source byte offsets is precomputed into `map`, then the transfer is
// delegated to the remap helpers. Only BORDER_MODE_REPLICATE and
// BORDER_MODE_CONSTANT are handled; other modes leave the destination
// untouched. borderValue is used only by the CONSTANT path.
void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                                    const u8 * srcBase, ptrdiff_t srcStride,
                                    const f32 * m,
                                    u8 * dstBase, ptrdiff_t dstStride,
                                    BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // Per-tile offset table; +16 slack so alignPtr can give a 16-byte base.
    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
    s32 * map = alignPtr(_map, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride);
    float32x4_t v_4 = vdupq_n_f32(4.0f);

    // Broadcast homography coefficients.
    float32x4_t v_m0 = vdupq_n_f32(m[0]);
    float32x4_t v_m1 = vdupq_n_f32(m[1]);
    float32x4_t v_m2 = vdupq_n_f32(m[2]);
    float32x4_t v_m3 = vdupq_n_f32(m[3]);
    float32x4_t v_m4 = vdupq_n_f32(m[4]);
    float32x4_t v_m5 = vdupq_n_f32(m[5]);
    float32x4_t v_m6 = vdupq_n_f32(m[6]);
    float32x4_t v_m7 = vdupq_n_f32(m[7]);
    float32x4_t v_m8 = vdupq_n_f32(m[8]);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    // y-dependent parts of numerators and denominator, hoisted.
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        // NOTE(review): vrecpq_f32 is an internal reciprocal helper
                        // (not a raw NEON intrinsic); the vector path may differ by
                        // an ulp or two from the scalar 1.0f/w division -- confirm
                        // against the internal header if exact parity matters.
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        // Clamp to the valid source rectangle (replicate border).
                        int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
                        int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
                        int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
                        vst1q_s32(map_row + x, v_src_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) pixels of the row.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;
                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);

                        src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
                        src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
                        map_row[x] = src_y * srcStride + src_x;
                    }
                }

                // make remap
                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                              getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        // Out-of-range pixels are encoded as offset -1 -> borderValue in the remap.
        int32x4_t v_m1_4 = vdupq_n_s32(-1);
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
                        // In-range test: coordinate >= 0 (on the float, to catch
                        // (-1, 0)) and converted index <= size-1.
                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
                                                      vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
                        int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
                        vst1q_s32(map_row + x, v_src_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;
                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);

                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                    }
                }

                // make remap
                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                          getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)m;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
// Bilinear perspective warp of a single-channel u8 image.
//
// Source mapping (see the scalar tail):
//     w     = m[2]*x + m[5]*y + m[8]
//     src_x = (m[0]*x + m[3]*y + m[6]) / w
//     src_y = (m[1]*x + m[4]*y + m[7]) / w
//
// Same tile scheme as warpAffineLinear: per destination pixel it precomputes
// 4 source byte offsets (the 2x2 interpolation footprint, vst4q-interleaved
// into `map`) and 2 fractional weights (vst2q-interleaved into `coeffs`),
// then delegates the weighted gather to the remapLinear* helpers. Only
// BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT are handled; other modes
// leave the destination untouched. borderValue is used only by the CONSTANT
// path (out-of-range taps are encoded as offset -1).
void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           const f32 * m,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE borderMode, u8 borderValue)
{
    internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
#ifdef CAROTENE_NEON
    using namespace internal;

    // 4 offsets and 2 coefficients per pixel; +16 slack for 16-byte alignment.
    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
    s32 * map = alignPtr(_map, 16);
    f32 * coeffs = alignPtr(_coeffs, 16);

    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);

    float32x4_t v_4 = vdupq_n_f32(4.0f);

    // Broadcast homography coefficients.
    float32x4_t v_m0 = vdupq_n_f32(m[0]);
    float32x4_t v_m1 = vdupq_n_f32(m[1]);
    float32x4_t v_m2 = vdupq_n_f32(m[2]);
    float32x4_t v_m3 = vdupq_n_f32(m[3]);
    float32x4_t v_m4 = vdupq_n_f32(m[4]);
    float32x4_t v_m5 = vdupq_n_f32(m[5]);
    float32x4_t v_m6 = vdupq_n_f32(m[6]);
    float32x4_t v_m7 = vdupq_n_f32(m[7]);
    float32x4_t v_m8 = vdupq_n_f32(m[8]);

    if (borderMode == BORDER_MODE_REPLICATE)
    {
        int32x4_t v_zero4 = vdupq_n_s32(0);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table: offsets + fractional weights for the tile
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    // y-dependent parts of numerators and denominator, hoisted.
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        // NOTE(review): vrecpq_f32 is an internal reciprocal helper
                        // (not a raw NEON intrinsic); may differ slightly from the
                        // scalar 1.0f/w division in the tail.
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);

                        // vcvtq truncates toward zero; for negative coordinates shift
                        // the fraction into [0,1) and decrement the integer part to
                        // obtain floor() semantics.
                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0])
                        ;
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);

                        // Clamp both footprint corners to the image (replicate border).
                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));

                        // Byte offsets of the 4 taps: TL, TR, BL, BR.
                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail for the remaining (< 4) pixels of the row.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;

                        s32 src0_x = (s32)floorf(src_x_f);
                        s32 src0_y = (s32)floorf(src_y_f);

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        // Clamp the +1 corner before clobbering src0_*.
                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));

                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                    }
                }

                // Weighted gather of the precomputed taps into the destination tile.
                remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                     srcBase, &map[0], &coeffs[0],
                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
            }
        }
    }
    else if (borderMode == BORDER_MODE_CONSTANT)
    {
        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
        int32x4_t v_m1_4 = vdupq_n_s32(-1);

        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
        {
            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
            {
                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);

                // compute table
                for (size_t y = 0; y < blockHeight; ++y)
                {
                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);

                    size_t x = 0, y_ = y + i;
                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
                                v_yw = vmlaq_f32(v_m8, v_m5, v_y);

                    for ( ; x + 4 <= blockWidth; x += 4)
                    {
                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
                        v_src_yf = vmulq_f32(v_wf, v_src_yf);

                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);

                        // Floor correction for negative coordinates (see REPLICATE path).
                        float32x4x2_t v_coeff;
                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);

                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);

                        int32x4x4_t v_dst_index;
                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);

                        // Per-tap in-range masks; the +1 taps reuse the float test
                        // shifted by one to catch coordinates in (-1, 0).
                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));

                        // Out-of-range taps become offset -1 -> borderValue in the remap.
                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);

                        vst2q_f32(coeff_row + (x << 1), v_coeff);
                        vst4q_s32(map_row + (x << 2), v_dst_index);

                        v_x = vaddq_f32(v_x, v_4);
                    }

                    // Scalar tail.
                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
                    {
                        f32 w_f = 1.0f / (m[2] * x_ + yw);
                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
                        f32 src_y_f = (m[1] * x_ + yy) * w_f;

                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;

                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;

                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                    }
                }

                remapLinearConst(Size2D(blockWidth, blockHeight),
                                 srcBase, &map[0], &coeffs[0],
                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
            }
        }
    }
#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)m;
    (void)dstBase;
    (void)dstStride;
    (void)borderMode;
    (void)borderValue;
#endif
}
|
||||
|
||||
} // namespace CAROTENE_NS
|
3
3rdparty/ffmpeg/.gitignore
vendored
Normal file
3
3rdparty/ffmpeg/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
downloads/
|
||||
*.dll
|
||||
ffmpeg_version.cmake
|
25
3rdparty/ffmpeg/ffmpeg.cmake
vendored
Normal file
25
3rdparty/ffmpeg/ffmpeg.cmake
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
# Binary branch name: ffmpeg/master_20160908
|
||||
# Binaries were created for OpenCV: 11a65475d8d460a01c8818c5a2d0544ec49d7d68
|
||||
set(FFMPEG_BINARIES_COMMIT "03835134465888981e066434dc95009e8328d4ea")
|
||||
set(FFMPEG_FILE_HASH_BIN32 "32ba7790b0ac7a6dc66be91603637a7d")
|
||||
set(FFMPEG_FILE_HASH_BIN64 "068ecaa459a5571e7909cff90999a420")
|
||||
set(FFMPEG_FILE_HASH_CMAKE "f99941d10c1e87bf16b9055e8fc91ab2")
|
||||
|
||||
set(FFMPEG_DOWNLOAD_URL ${OPENCV_FFMPEG_URL};$ENV{OPENCV_FFMPEG_URL};https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FFMPEG_BINARIES_COMMIT}/ffmpeg/)
|
||||
|
||||
ocv_download(PACKAGE opencv_ffmpeg.dll
|
||||
HASH ${FFMPEG_FILE_HASH_BIN32}
|
||||
URL ${FFMPEG_DOWNLOAD_URL}
|
||||
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
ocv_download(PACKAGE opencv_ffmpeg_64.dll
|
||||
HASH ${FFMPEG_FILE_HASH_BIN64}
|
||||
URL ${FFMPEG_DOWNLOAD_URL}
|
||||
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
ocv_download(PACKAGE ffmpeg_version.cmake
|
||||
HASH ${FFMPEG_FILE_HASH_CMAKE}
|
||||
URL ${FFMPEG_DOWNLOAD_URL}
|
||||
DESTINATION_DIR ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/ffmpeg_version.cmake)
|
11
3rdparty/ffmpeg/ffmpeg_version.cmake
vendored
11
3rdparty/ffmpeg/ffmpeg_version.cmake
vendored
@ -1,11 +0,0 @@
|
||||
set(HAVE_FFMPEG 1)
|
||||
set(HAVE_FFMPEG_CODEC 1)
|
||||
set(HAVE_FFMPEG_FORMAT 1)
|
||||
set(HAVE_FFMPEG_UTIL 1)
|
||||
set(HAVE_FFMPEG_SWSCALE 1)
|
||||
set(HAVE_GENTOO_FFMPEG 1)
|
||||
|
||||
set(ALIASOF_libavcodec_VERSION 55.18.102)
|
||||
set(ALIASOF_libavformat_VERSION 55.12.100)
|
||||
set(ALIASOF_libavutil_VERSION 52.38.100)
|
||||
set(ALIASOF_libswscale_VERSION 2.3.100)
|
1
3rdparty/ffmpeg/ffopencv.c
vendored
1
3rdparty/ffmpeg/ffopencv.c
vendored
@ -1 +0,0 @@
|
||||
#include "cap_ffmpeg_impl.hpp"
|
520
3rdparty/ffmpeg/license.txt
vendored
Normal file
520
3rdparty/ffmpeg/license.txt
vendored
Normal file
@ -0,0 +1,520 @@
|
||||
Copyright (C) 2001 Fabrice Bellard
|
||||
|
||||
FFmpeg is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
FFmpeg is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with FFmpeg; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
==================================================================================
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
2
3rdparty/ffmpeg/make.bat
vendored
2
3rdparty/ffmpeg/make.bat
vendored
@ -1,2 +0,0 @@
|
||||
set path=c:\dev\msys32\bin;%path% & gcc -Wall -shared -o opencv_ffmpeg.dll -O2 -x c++ -I../include -I../include/ffmpeg_ -I../../modules/highgui/src ffopencv.c -L../lib -lavformat -lavcodec -lavdevice -lswscale -lavutil -lws2_32
|
||||
set path=c:\dev\msys64\bin;%path% & gcc -m64 -Wall -shared -o opencv_ffmpeg_64.dll -O2 -x c++ -I../include -I../include/ffmpeg_ -I../../modules/highgui/src ffopencv.c -L../lib -lavformat64 -lavcodec64 -lavdevice64 -lswscale64 -lavutil64 -lws2_32
|
BIN
3rdparty/ffmpeg/opencv_ffmpeg.dll
vendored
BIN
3rdparty/ffmpeg/opencv_ffmpeg.dll
vendored
Binary file not shown.
BIN
3rdparty/ffmpeg/opencv_ffmpeg_64.dll
vendored
BIN
3rdparty/ffmpeg/opencv_ffmpeg_64.dll
vendored
Binary file not shown.
64
3rdparty/ffmpeg/readme.txt
vendored
64
3rdparty/ffmpeg/readme.txt
vendored
@ -1,42 +1,38 @@
|
||||
The build script is to be fixed.
|
||||
Right now it assumes that 32-bit MinGW is in the system path and
|
||||
64-bit mingw is installed to c:\Apps\MinGW64.
|
||||
* On Linux and other Unix flavors OpenCV uses default or user-built ffmpeg/libav libraries.
|
||||
If user builds ffmpeg/libav from source and wants OpenCV to stay BSD library, not GPL/LGPL,
|
||||
he/she should use --enabled-shared configure flag and make sure that no GPL components are
|
||||
enabled (some notable examples are x264 (H264 encoder) and libac3 (Dolby AC3 audio codec)).
|
||||
See https://www.ffmpeg.org/legal.html for details.
|
||||
|
||||
It is important that gcc is used, not g++!
|
||||
Otherwise the produced DLL will likely be dependent on libgcc_s_dw2-1.dll or similar DLL.
|
||||
While we want to make the DLLs with minimum dependencies: Win32 libraries + msvcrt.dll.
|
||||
If you want to play very safe and do not want to use FFMPEG at all, regardless of whether it's installed on
|
||||
your system or not, configure and build OpenCV using CMake with WITH_FFMPEG=OFF flag. OpenCV will then use
|
||||
AVFoundation (OSX), GStreamer (Linux) or other available backends supported by opencv_videoio module.
|
||||
|
||||
ffopencv.c is really a C++ source, hence -x c++ is used.
|
||||
There is also our self-contained motion jpeg codec, which you can use without any worries.
|
||||
It handles CV_FOURCC('M', 'J', 'P', 'G') streams within an AVI container (".avi").
|
||||
|
||||
How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is release?
|
||||
* On Windows OpenCV uses pre-built ffmpeg binaries, built with proper flags (without GPL components) and
|
||||
wrapped with simple, stable OpenCV-compatible API.
|
||||
The binaries are opencv_ffmpeg.dll (version for 32-bit Windows) and
|
||||
opencv_ffmpeg_64.dll (version for 64-bit Windows).
|
||||
|
||||
1. Install 32-bit MinGW + MSYS from
|
||||
http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
|
||||
Let's assume, it's installed in C:\MSYS32.
|
||||
2. Install 64-bit MinGW. http://mingw-w64.sourceforge.net/
|
||||
Let's assume, it's installed in C:\MSYS64
|
||||
3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
|
||||
See build_win32.txt for the build instructions, if you want to rebuild opencv_ffmpeg*.dll from scratch.
|
||||
|
||||
4. Now you have working MSYS32 and MSYS64 environments.
|
||||
Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
|
||||
The pre-built opencv_ffmpeg*.dll is:
|
||||
* LGPL library, not BSD libraries.
|
||||
* Loaded at runtime by opencv_videoio module.
|
||||
If it succeeds, ffmpeg can be used to decode/encode videos;
|
||||
otherwise, other API is used.
|
||||
|
||||
4. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
|
||||
Copy it to C:\MSYS{32|64}\msys\home\<loginname> directory.
|
||||
FFMPEG build contains H264 encoder based on the OpenH264 library, that should be installed separatelly.
|
||||
OpenH264 Video Codec provided by Cisco Systems, Inc.
|
||||
See https://github.com/cisco/openh264/releases for details and OpenH264 license.
|
||||
Downloaded binary file can be placed into global system path (System32 or SysWOW64) or near application binaries.
|
||||
You can also specify location of binary file via OPENH264_LIBRARY_PATH environment variable.
|
||||
|
||||
5. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
|
||||
If LGPL/GPL software can not be supplied with your OpenCV-based product, simply exclude
|
||||
opencv_ffmpeg*.dll from your distribution; OpenCV will stay fully functional except for the ability to
|
||||
decode/encode videos using FFMPEG (though, it may still be able to do that using other API,
|
||||
such as Video for Windows, Windows Media Foundation or our self-contained motion jpeg codec).
|
||||
|
||||
5.1. tar -xzf ffmpeg-x.y.z.tar.gz
|
||||
5.2. mkdir build
|
||||
5.3. cd build
|
||||
5.4. ../ffmpeg-x.y.z/configure --enable-w32threads
|
||||
5.5. make
|
||||
5.6. make install
|
||||
5.7. cd /local/lib
|
||||
5.8. strip -g *.a
|
||||
|
||||
6. Then repeat the same for 64-bit case. The output libs: libavcodec.a etc. need to be renamed to libavcodec64.a etc.
|
||||
|
||||
7. Then, copy all those libs to <opencv>\3rdparty\lib\, copy the headers to <opencv>\3rdparty\include\ffmpeg_.
|
||||
|
||||
8. Then, go to <opencv>\3rdparty\ffmpeg, edit make.bat
|
||||
(change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat
|
||||
See license.txt for the FFMPEG copyright notice and the licensing terms.
|
||||
|
4863
3rdparty/include/ffmpeg_/libavcodec/avcodec.h
vendored
4863
3rdparty/include/ffmpeg_/libavcodec/avcodec.h
vendored
File diff suppressed because it is too large
Load Diff
116
3rdparty/include/ffmpeg_/libavcodec/avfft.h
vendored
116
3rdparty/include/ffmpeg_/libavcodec/avfft.h
vendored
@ -1,116 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_AVFFT_H
|
||||
#define AVCODEC_AVFFT_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_fft
|
||||
* FFT functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup lavc_fft FFT functions
|
||||
* @ingroup lavc_misc
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
typedef float FFTSample;
|
||||
|
||||
typedef struct FFTComplex {
|
||||
FFTSample re, im;
|
||||
} FFTComplex;
|
||||
|
||||
typedef struct FFTContext FFTContext;
|
||||
|
||||
/**
|
||||
* Set up a complex FFT.
|
||||
* @param nbits log2 of the length of the input array
|
||||
* @param inverse if 0 perform the forward transform, if 1 perform the inverse
|
||||
*/
|
||||
FFTContext *av_fft_init(int nbits, int inverse);
|
||||
|
||||
/**
|
||||
* Do the permutation needed BEFORE calling ff_fft_calc().
|
||||
*/
|
||||
void av_fft_permute(FFTContext *s, FFTComplex *z);
|
||||
|
||||
/**
|
||||
* Do a complex FFT with the parameters defined in av_fft_init(). The
|
||||
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
|
||||
*/
|
||||
void av_fft_calc(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void av_fft_end(FFTContext *s);
|
||||
|
||||
FFTContext *av_mdct_init(int nbits, int inverse, double scale);
|
||||
void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void av_mdct_end(FFTContext *s);
|
||||
|
||||
/* Real Discrete Fourier Transform */
|
||||
|
||||
enum RDFTransformType {
|
||||
DFT_R2C,
|
||||
IDFT_C2R,
|
||||
IDFT_R2C,
|
||||
DFT_C2R,
|
||||
};
|
||||
|
||||
typedef struct RDFTContext RDFTContext;
|
||||
|
||||
/**
|
||||
* Set up a real FFT.
|
||||
* @param nbits log2 of the length of the input array
|
||||
* @param trans the type of transform
|
||||
*/
|
||||
RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans);
|
||||
void av_rdft_calc(RDFTContext *s, FFTSample *data);
|
||||
void av_rdft_end(RDFTContext *s);
|
||||
|
||||
/* Discrete Cosine Transform */
|
||||
|
||||
typedef struct DCTContext DCTContext;
|
||||
|
||||
enum DCTTransformType {
|
||||
DCT_II = 0,
|
||||
DCT_III,
|
||||
DCT_I,
|
||||
DST_I,
|
||||
};
|
||||
|
||||
/**
|
||||
* Set up DCT.
|
||||
* @param nbits size of the input array:
|
||||
* (1 << nbits) for DCT-II, DCT-III and DST-I
|
||||
* (1 << nbits) + 1 for DCT-I
|
||||
*
|
||||
* @note the first element of the input of DST-I is ignored
|
||||
*/
|
||||
DCTContext *av_dct_init(int nbits, enum DCTTransformType type);
|
||||
void av_dct_calc(DCTContext *s, FFTSample *data);
|
||||
void av_dct_end (DCTContext *s);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVCODEC_AVFFT_H */
|
95
3rdparty/include/ffmpeg_/libavcodec/dxva2.h
vendored
95
3rdparty/include/ffmpeg_/libavcodec/dxva2.h
vendored
@ -1,95 +0,0 @@
|
||||
/*
|
||||
* DXVA2 HW acceleration
|
||||
*
|
||||
* copyright (c) 2009 Laurent Aimar
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_DXVA_H
|
||||
#define AVCODEC_DXVA_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_codec_hwaccel_dxva2
|
||||
* Public libavcodec DXVA2 header.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0600
|
||||
#undef _WIN32_WINNT
|
||||
#endif
|
||||
|
||||
#if !defined(_WIN32_WINNT)
|
||||
#define _WIN32_WINNT 0x0600
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <d3d9.h>
|
||||
#include <dxva2api.h>
|
||||
|
||||
/**
|
||||
* @defgroup lavc_codec_hwaccel_dxva2 DXVA2
|
||||
* @ingroup lavc_codec_hwaccel
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
#define FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG 1 ///< Work around for DXVA2 and old UVD/UVD+ ATI video cards
|
||||
|
||||
/**
|
||||
* This structure is used to provides the necessary configurations and data
|
||||
* to the DXVA2 FFmpeg HWAccel implementation.
|
||||
*
|
||||
* The application must make it available as AVCodecContext.hwaccel_context.
|
||||
*/
|
||||
struct dxva_context {
|
||||
/**
|
||||
* DXVA2 decoder object
|
||||
*/
|
||||
IDirectXVideoDecoder *decoder;
|
||||
|
||||
/**
|
||||
* DXVA2 configuration used to create the decoder
|
||||
*/
|
||||
const DXVA2_ConfigPictureDecode *cfg;
|
||||
|
||||
/**
|
||||
* The number of surface in the surface array
|
||||
*/
|
||||
unsigned surface_count;
|
||||
|
||||
/**
|
||||
* The array of Direct3D surfaces used to create the decoder
|
||||
*/
|
||||
LPDIRECT3DSURFACE9 *surface;
|
||||
|
||||
/**
|
||||
* A bit field configuring the workarounds needed for using the decoder
|
||||
*/
|
||||
uint64_t workaround;
|
||||
|
||||
/**
|
||||
* Private to the FFmpeg AVHWAccel implementation
|
||||
*/
|
||||
unsigned report_id;
|
||||
};
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVCODEC_DXVA_H */
|
397
3rdparty/include/ffmpeg_/libavcodec/old_codec_ids.h
vendored
397
3rdparty/include/ffmpeg_/libavcodec/old_codec_ids.h
vendored
@ -1,397 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_OLD_CODEC_IDS_H
|
||||
#define AVCODEC_OLD_CODEC_IDS_H
|
||||
|
||||
#include "libavutil/common.h"
|
||||
|
||||
/*
|
||||
* This header exists to prevent new codec IDs from being accidentally added to
|
||||
* the deprecated list.
|
||||
* Do not include it directly. It will be removed on next major bump
|
||||
*
|
||||
* Do not add new items to this list. Use the AVCodecID enum instead.
|
||||
*/
|
||||
|
||||
CODEC_ID_NONE = AV_CODEC_ID_NONE,
|
||||
|
||||
/* video codecs */
|
||||
CODEC_ID_MPEG1VIDEO,
|
||||
CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding
|
||||
CODEC_ID_MPEG2VIDEO_XVMC,
|
||||
CODEC_ID_H261,
|
||||
CODEC_ID_H263,
|
||||
CODEC_ID_RV10,
|
||||
CODEC_ID_RV20,
|
||||
CODEC_ID_MJPEG,
|
||||
CODEC_ID_MJPEGB,
|
||||
CODEC_ID_LJPEG,
|
||||
CODEC_ID_SP5X,
|
||||
CODEC_ID_JPEGLS,
|
||||
CODEC_ID_MPEG4,
|
||||
CODEC_ID_RAWVIDEO,
|
||||
CODEC_ID_MSMPEG4V1,
|
||||
CODEC_ID_MSMPEG4V2,
|
||||
CODEC_ID_MSMPEG4V3,
|
||||
CODEC_ID_WMV1,
|
||||
CODEC_ID_WMV2,
|
||||
CODEC_ID_H263P,
|
||||
CODEC_ID_H263I,
|
||||
CODEC_ID_FLV1,
|
||||
CODEC_ID_SVQ1,
|
||||
CODEC_ID_SVQ3,
|
||||
CODEC_ID_DVVIDEO,
|
||||
CODEC_ID_HUFFYUV,
|
||||
CODEC_ID_CYUV,
|
||||
CODEC_ID_H264,
|
||||
CODEC_ID_INDEO3,
|
||||
CODEC_ID_VP3,
|
||||
CODEC_ID_THEORA,
|
||||
CODEC_ID_ASV1,
|
||||
CODEC_ID_ASV2,
|
||||
CODEC_ID_FFV1,
|
||||
CODEC_ID_4XM,
|
||||
CODEC_ID_VCR1,
|
||||
CODEC_ID_CLJR,
|
||||
CODEC_ID_MDEC,
|
||||
CODEC_ID_ROQ,
|
||||
CODEC_ID_INTERPLAY_VIDEO,
|
||||
CODEC_ID_XAN_WC3,
|
||||
CODEC_ID_XAN_WC4,
|
||||
CODEC_ID_RPZA,
|
||||
CODEC_ID_CINEPAK,
|
||||
CODEC_ID_WS_VQA,
|
||||
CODEC_ID_MSRLE,
|
||||
CODEC_ID_MSVIDEO1,
|
||||
CODEC_ID_IDCIN,
|
||||
CODEC_ID_8BPS,
|
||||
CODEC_ID_SMC,
|
||||
CODEC_ID_FLIC,
|
||||
CODEC_ID_TRUEMOTION1,
|
||||
CODEC_ID_VMDVIDEO,
|
||||
CODEC_ID_MSZH,
|
||||
CODEC_ID_ZLIB,
|
||||
CODEC_ID_QTRLE,
|
||||
CODEC_ID_TSCC,
|
||||
CODEC_ID_ULTI,
|
||||
CODEC_ID_QDRAW,
|
||||
CODEC_ID_VIXL,
|
||||
CODEC_ID_QPEG,
|
||||
CODEC_ID_PNG,
|
||||
CODEC_ID_PPM,
|
||||
CODEC_ID_PBM,
|
||||
CODEC_ID_PGM,
|
||||
CODEC_ID_PGMYUV,
|
||||
CODEC_ID_PAM,
|
||||
CODEC_ID_FFVHUFF,
|
||||
CODEC_ID_RV30,
|
||||
CODEC_ID_RV40,
|
||||
CODEC_ID_VC1,
|
||||
CODEC_ID_WMV3,
|
||||
CODEC_ID_LOCO,
|
||||
CODEC_ID_WNV1,
|
||||
CODEC_ID_AASC,
|
||||
CODEC_ID_INDEO2,
|
||||
CODEC_ID_FRAPS,
|
||||
CODEC_ID_TRUEMOTION2,
|
||||
CODEC_ID_BMP,
|
||||
CODEC_ID_CSCD,
|
||||
CODEC_ID_MMVIDEO,
|
||||
CODEC_ID_ZMBV,
|
||||
CODEC_ID_AVS,
|
||||
CODEC_ID_SMACKVIDEO,
|
||||
CODEC_ID_NUV,
|
||||
CODEC_ID_KMVC,
|
||||
CODEC_ID_FLASHSV,
|
||||
CODEC_ID_CAVS,
|
||||
CODEC_ID_JPEG2000,
|
||||
CODEC_ID_VMNC,
|
||||
CODEC_ID_VP5,
|
||||
CODEC_ID_VP6,
|
||||
CODEC_ID_VP6F,
|
||||
CODEC_ID_TARGA,
|
||||
CODEC_ID_DSICINVIDEO,
|
||||
CODEC_ID_TIERTEXSEQVIDEO,
|
||||
CODEC_ID_TIFF,
|
||||
CODEC_ID_GIF,
|
||||
CODEC_ID_DXA,
|
||||
CODEC_ID_DNXHD,
|
||||
CODEC_ID_THP,
|
||||
CODEC_ID_SGI,
|
||||
CODEC_ID_C93,
|
||||
CODEC_ID_BETHSOFTVID,
|
||||
CODEC_ID_PTX,
|
||||
CODEC_ID_TXD,
|
||||
CODEC_ID_VP6A,
|
||||
CODEC_ID_AMV,
|
||||
CODEC_ID_VB,
|
||||
CODEC_ID_PCX,
|
||||
CODEC_ID_SUNRAST,
|
||||
CODEC_ID_INDEO4,
|
||||
CODEC_ID_INDEO5,
|
||||
CODEC_ID_MIMIC,
|
||||
CODEC_ID_RL2,
|
||||
CODEC_ID_ESCAPE124,
|
||||
CODEC_ID_DIRAC,
|
||||
CODEC_ID_BFI,
|
||||
CODEC_ID_CMV,
|
||||
CODEC_ID_MOTIONPIXELS,
|
||||
CODEC_ID_TGV,
|
||||
CODEC_ID_TGQ,
|
||||
CODEC_ID_TQI,
|
||||
CODEC_ID_AURA,
|
||||
CODEC_ID_AURA2,
|
||||
CODEC_ID_V210X,
|
||||
CODEC_ID_TMV,
|
||||
CODEC_ID_V210,
|
||||
CODEC_ID_DPX,
|
||||
CODEC_ID_MAD,
|
||||
CODEC_ID_FRWU,
|
||||
CODEC_ID_FLASHSV2,
|
||||
CODEC_ID_CDGRAPHICS,
|
||||
CODEC_ID_R210,
|
||||
CODEC_ID_ANM,
|
||||
CODEC_ID_BINKVIDEO,
|
||||
CODEC_ID_IFF_ILBM,
|
||||
CODEC_ID_IFF_BYTERUN1,
|
||||
CODEC_ID_KGV1,
|
||||
CODEC_ID_YOP,
|
||||
CODEC_ID_VP8,
|
||||
CODEC_ID_PICTOR,
|
||||
CODEC_ID_ANSI,
|
||||
CODEC_ID_A64_MULTI,
|
||||
CODEC_ID_A64_MULTI5,
|
||||
CODEC_ID_R10K,
|
||||
CODEC_ID_MXPEG,
|
||||
CODEC_ID_LAGARITH,
|
||||
CODEC_ID_PRORES,
|
||||
CODEC_ID_JV,
|
||||
CODEC_ID_DFA,
|
||||
CODEC_ID_WMV3IMAGE,
|
||||
CODEC_ID_VC1IMAGE,
|
||||
CODEC_ID_UTVIDEO,
|
||||
CODEC_ID_BMV_VIDEO,
|
||||
CODEC_ID_VBLE,
|
||||
CODEC_ID_DXTORY,
|
||||
CODEC_ID_V410,
|
||||
CODEC_ID_XWD,
|
||||
CODEC_ID_CDXL,
|
||||
CODEC_ID_XBM,
|
||||
CODEC_ID_ZEROCODEC,
|
||||
CODEC_ID_MSS1,
|
||||
CODEC_ID_MSA1,
|
||||
CODEC_ID_TSCC2,
|
||||
CODEC_ID_MTS2,
|
||||
CODEC_ID_CLLC,
|
||||
CODEC_ID_Y41P = MKBETAG('Y','4','1','P'),
|
||||
CODEC_ID_ESCAPE130 = MKBETAG('E','1','3','0'),
|
||||
CODEC_ID_EXR = MKBETAG('0','E','X','R'),
|
||||
CODEC_ID_AVRP = MKBETAG('A','V','R','P'),
|
||||
|
||||
CODEC_ID_G2M = MKBETAG( 0 ,'G','2','M'),
|
||||
CODEC_ID_AVUI = MKBETAG('A','V','U','I'),
|
||||
CODEC_ID_AYUV = MKBETAG('A','Y','U','V'),
|
||||
CODEC_ID_V308 = MKBETAG('V','3','0','8'),
|
||||
CODEC_ID_V408 = MKBETAG('V','4','0','8'),
|
||||
CODEC_ID_YUV4 = MKBETAG('Y','U','V','4'),
|
||||
CODEC_ID_SANM = MKBETAG('S','A','N','M'),
|
||||
CODEC_ID_PAF_VIDEO = MKBETAG('P','A','F','V'),
|
||||
CODEC_ID_SNOW = AV_CODEC_ID_SNOW,
|
||||
|
||||
/* various PCM "codecs" */
|
||||
CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
|
||||
CODEC_ID_PCM_S16LE = 0x10000,
|
||||
CODEC_ID_PCM_S16BE,
|
||||
CODEC_ID_PCM_U16LE,
|
||||
CODEC_ID_PCM_U16BE,
|
||||
CODEC_ID_PCM_S8,
|
||||
CODEC_ID_PCM_U8,
|
||||
CODEC_ID_PCM_MULAW,
|
||||
CODEC_ID_PCM_ALAW,
|
||||
CODEC_ID_PCM_S32LE,
|
||||
CODEC_ID_PCM_S32BE,
|
||||
CODEC_ID_PCM_U32LE,
|
||||
CODEC_ID_PCM_U32BE,
|
||||
CODEC_ID_PCM_S24LE,
|
||||
CODEC_ID_PCM_S24BE,
|
||||
CODEC_ID_PCM_U24LE,
|
||||
CODEC_ID_PCM_U24BE,
|
||||
CODEC_ID_PCM_S24DAUD,
|
||||
CODEC_ID_PCM_ZORK,
|
||||
CODEC_ID_PCM_S16LE_PLANAR,
|
||||
CODEC_ID_PCM_DVD,
|
||||
CODEC_ID_PCM_F32BE,
|
||||
CODEC_ID_PCM_F32LE,
|
||||
CODEC_ID_PCM_F64BE,
|
||||
CODEC_ID_PCM_F64LE,
|
||||
CODEC_ID_PCM_BLURAY,
|
||||
CODEC_ID_PCM_LXF,
|
||||
CODEC_ID_S302M,
|
||||
CODEC_ID_PCM_S8_PLANAR,
|
||||
|
||||
/* various ADPCM codecs */
|
||||
CODEC_ID_ADPCM_IMA_QT = 0x11000,
|
||||
CODEC_ID_ADPCM_IMA_WAV,
|
||||
CODEC_ID_ADPCM_IMA_DK3,
|
||||
CODEC_ID_ADPCM_IMA_DK4,
|
||||
CODEC_ID_ADPCM_IMA_WS,
|
||||
CODEC_ID_ADPCM_IMA_SMJPEG,
|
||||
CODEC_ID_ADPCM_MS,
|
||||
CODEC_ID_ADPCM_4XM,
|
||||
CODEC_ID_ADPCM_XA,
|
||||
CODEC_ID_ADPCM_ADX,
|
||||
CODEC_ID_ADPCM_EA,
|
||||
CODEC_ID_ADPCM_G726,
|
||||
CODEC_ID_ADPCM_CT,
|
||||
CODEC_ID_ADPCM_SWF,
|
||||
CODEC_ID_ADPCM_YAMAHA,
|
||||
CODEC_ID_ADPCM_SBPRO_4,
|
||||
CODEC_ID_ADPCM_SBPRO_3,
|
||||
CODEC_ID_ADPCM_SBPRO_2,
|
||||
CODEC_ID_ADPCM_THP,
|
||||
CODEC_ID_ADPCM_IMA_AMV,
|
||||
CODEC_ID_ADPCM_EA_R1,
|
||||
CODEC_ID_ADPCM_EA_R3,
|
||||
CODEC_ID_ADPCM_EA_R2,
|
||||
CODEC_ID_ADPCM_IMA_EA_SEAD,
|
||||
CODEC_ID_ADPCM_IMA_EA_EACS,
|
||||
CODEC_ID_ADPCM_EA_XAS,
|
||||
CODEC_ID_ADPCM_EA_MAXIS_XA,
|
||||
CODEC_ID_ADPCM_IMA_ISS,
|
||||
CODEC_ID_ADPCM_G722,
|
||||
CODEC_ID_ADPCM_IMA_APC,
|
||||
CODEC_ID_VIMA = MKBETAG('V','I','M','A'),
|
||||
|
||||
/* AMR */
|
||||
CODEC_ID_AMR_NB = 0x12000,
|
||||
CODEC_ID_AMR_WB,
|
||||
|
||||
/* RealAudio codecs*/
|
||||
CODEC_ID_RA_144 = 0x13000,
|
||||
CODEC_ID_RA_288,
|
||||
|
||||
/* various DPCM codecs */
|
||||
CODEC_ID_ROQ_DPCM = 0x14000,
|
||||
CODEC_ID_INTERPLAY_DPCM,
|
||||
CODEC_ID_XAN_DPCM,
|
||||
CODEC_ID_SOL_DPCM,
|
||||
|
||||
/* audio codecs */
|
||||
CODEC_ID_MP2 = 0x15000,
|
||||
CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3
|
||||
CODEC_ID_AAC,
|
||||
CODEC_ID_AC3,
|
||||
CODEC_ID_DTS,
|
||||
CODEC_ID_VORBIS,
|
||||
CODEC_ID_DVAUDIO,
|
||||
CODEC_ID_WMAV1,
|
||||
CODEC_ID_WMAV2,
|
||||
CODEC_ID_MACE3,
|
||||
CODEC_ID_MACE6,
|
||||
CODEC_ID_VMDAUDIO,
|
||||
CODEC_ID_FLAC,
|
||||
CODEC_ID_MP3ADU,
|
||||
CODEC_ID_MP3ON4,
|
||||
CODEC_ID_SHORTEN,
|
||||
CODEC_ID_ALAC,
|
||||
CODEC_ID_WESTWOOD_SND1,
|
||||
CODEC_ID_GSM, ///< as in Berlin toast format
|
||||
CODEC_ID_QDM2,
|
||||
CODEC_ID_COOK,
|
||||
CODEC_ID_TRUESPEECH,
|
||||
CODEC_ID_TTA,
|
||||
CODEC_ID_SMACKAUDIO,
|
||||
CODEC_ID_QCELP,
|
||||
CODEC_ID_WAVPACK,
|
||||
CODEC_ID_DSICINAUDIO,
|
||||
CODEC_ID_IMC,
|
||||
CODEC_ID_MUSEPACK7,
|
||||
CODEC_ID_MLP,
|
||||
CODEC_ID_GSM_MS, /* as found in WAV */
|
||||
CODEC_ID_ATRAC3,
|
||||
CODEC_ID_VOXWARE,
|
||||
CODEC_ID_APE,
|
||||
CODEC_ID_NELLYMOSER,
|
||||
CODEC_ID_MUSEPACK8,
|
||||
CODEC_ID_SPEEX,
|
||||
CODEC_ID_WMAVOICE,
|
||||
CODEC_ID_WMAPRO,
|
||||
CODEC_ID_WMALOSSLESS,
|
||||
CODEC_ID_ATRAC3P,
|
||||
CODEC_ID_EAC3,
|
||||
CODEC_ID_SIPR,
|
||||
CODEC_ID_MP1,
|
||||
CODEC_ID_TWINVQ,
|
||||
CODEC_ID_TRUEHD,
|
||||
CODEC_ID_MP4ALS,
|
||||
CODEC_ID_ATRAC1,
|
||||
CODEC_ID_BINKAUDIO_RDFT,
|
||||
CODEC_ID_BINKAUDIO_DCT,
|
||||
CODEC_ID_AAC_LATM,
|
||||
CODEC_ID_QDMC,
|
||||
CODEC_ID_CELT,
|
||||
CODEC_ID_G723_1,
|
||||
CODEC_ID_G729,
|
||||
CODEC_ID_8SVX_EXP,
|
||||
CODEC_ID_8SVX_FIB,
|
||||
CODEC_ID_BMV_AUDIO,
|
||||
CODEC_ID_RALF,
|
||||
CODEC_ID_IAC,
|
||||
CODEC_ID_ILBC,
|
||||
CODEC_ID_FFWAVESYNTH = MKBETAG('F','F','W','S'),
|
||||
CODEC_ID_SONIC = MKBETAG('S','O','N','C'),
|
||||
CODEC_ID_SONIC_LS = MKBETAG('S','O','N','L'),
|
||||
CODEC_ID_PAF_AUDIO = MKBETAG('P','A','F','A'),
|
||||
CODEC_ID_OPUS = MKBETAG('O','P','U','S'),
|
||||
|
||||
/* subtitle codecs */
|
||||
CODEC_ID_FIRST_SUBTITLE = 0x17000, ///< A dummy ID pointing at the start of subtitle codecs.
|
||||
CODEC_ID_DVD_SUBTITLE = 0x17000,
|
||||
CODEC_ID_DVB_SUBTITLE,
|
||||
CODEC_ID_TEXT, ///< raw UTF-8 text
|
||||
CODEC_ID_XSUB,
|
||||
CODEC_ID_SSA,
|
||||
CODEC_ID_MOV_TEXT,
|
||||
CODEC_ID_HDMV_PGS_SUBTITLE,
|
||||
CODEC_ID_DVB_TELETEXT,
|
||||
CODEC_ID_SRT,
|
||||
CODEC_ID_MICRODVD = MKBETAG('m','D','V','D'),
|
||||
CODEC_ID_EIA_608 = MKBETAG('c','6','0','8'),
|
||||
CODEC_ID_JACOSUB = MKBETAG('J','S','U','B'),
|
||||
CODEC_ID_SAMI = MKBETAG('S','A','M','I'),
|
||||
CODEC_ID_REALTEXT = MKBETAG('R','T','X','T'),
|
||||
CODEC_ID_SUBVIEWER = MKBETAG('S','u','b','V'),
|
||||
|
||||
/* other specific kind of codecs (generally used for attachments) */
|
||||
CODEC_ID_FIRST_UNKNOWN = 0x18000, ///< A dummy ID pointing at the start of various fake codecs.
|
||||
CODEC_ID_TTF = 0x18000,
|
||||
CODEC_ID_BINTEXT = MKBETAG('B','T','X','T'),
|
||||
CODEC_ID_XBIN = MKBETAG('X','B','I','N'),
|
||||
CODEC_ID_IDF = MKBETAG( 0 ,'I','D','F'),
|
||||
CODEC_ID_OTF = MKBETAG( 0 ,'O','T','F'),
|
||||
|
||||
CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like CODEC_ID_NONE) but lavf should attempt to identify it
|
||||
|
||||
CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
|
||||
* stream (only used by libavformat) */
|
||||
CODEC_ID_MPEG4SYSTEMS = 0x20001, /**< _FAKE_ codec to indicate a MPEG-4 Systems
|
||||
* stream (only used by libavformat) */
|
||||
CODEC_ID_FFMETADATA = 0x21000, ///< Dummy codec for streams containing only metadata information.
|
||||
|
||||
#endif /* AVCODEC_OLD_CODEC_IDS_H */
|
173
3rdparty/include/ffmpeg_/libavcodec/vaapi.h
vendored
173
3rdparty/include/ffmpeg_/libavcodec/vaapi.h
vendored
@ -1,173 +0,0 @@
|
||||
/*
|
||||
* Video Acceleration API (shared data between FFmpeg and the video player)
|
||||
* HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
|
||||
*
|
||||
* Copyright (C) 2008-2009 Splitted-Desktop Systems
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_VAAPI_H
|
||||
#define AVCODEC_VAAPI_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_codec_hwaccel_vaapi
|
||||
* Public libavcodec VA API header.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
|
||||
* @ingroup lavc_codec_hwaccel
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* This structure is used to share data between the FFmpeg library and
|
||||
* the client video application.
|
||||
* This shall be zero-allocated and available as
|
||||
* AVCodecContext.hwaccel_context. All user members can be set once
|
||||
* during initialization or through each AVCodecContext.get_buffer()
|
||||
* function call. In any case, they must be valid prior to calling
|
||||
* decoding functions.
|
||||
*/
|
||||
struct vaapi_context {
|
||||
/**
|
||||
* Window system dependent data
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by user
|
||||
*/
|
||||
void *display;
|
||||
|
||||
/**
|
||||
* Configuration ID
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by user
|
||||
*/
|
||||
uint32_t config_id;
|
||||
|
||||
/**
|
||||
* Context ID (video decode pipeline)
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by user
|
||||
*/
|
||||
uint32_t context_id;
|
||||
|
||||
/**
|
||||
* VAPictureParameterBuffer ID
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
uint32_t pic_param_buf_id;
|
||||
|
||||
/**
|
||||
* VAIQMatrixBuffer ID
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
uint32_t iq_matrix_buf_id;
|
||||
|
||||
/**
|
||||
* VABitPlaneBuffer ID (for VC-1 decoding)
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
uint32_t bitplane_buf_id;
|
||||
|
||||
/**
|
||||
* Slice parameter/data buffer IDs
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
uint32_t *slice_buf_ids;
|
||||
|
||||
/**
|
||||
* Number of effective slice buffer IDs to send to the HW
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
unsigned int n_slice_buf_ids;
|
||||
|
||||
/**
|
||||
* Size of pre-allocated slice_buf_ids
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
unsigned int slice_buf_ids_alloc;
|
||||
|
||||
/**
|
||||
* Pointer to VASliceParameterBuffers
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
void *slice_params;
|
||||
|
||||
/**
|
||||
* Size of a VASliceParameterBuffer element
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
unsigned int slice_param_size;
|
||||
|
||||
/**
|
||||
* Size of pre-allocated slice_params
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
unsigned int slice_params_alloc;
|
||||
|
||||
/**
|
||||
* Number of slices currently filled in
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
unsigned int slice_count;
|
||||
|
||||
/**
|
||||
* Pointer to slice data buffer base
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
const uint8_t *slice_data;
|
||||
|
||||
/**
|
||||
* Current size of slice data
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set by libavcodec
|
||||
*/
|
||||
uint32_t slice_data_size;
|
||||
};
|
||||
|
||||
/* @} */
|
||||
|
||||
#endif /* AVCODEC_VAAPI_H */
|
162
3rdparty/include/ffmpeg_/libavcodec/vda.h
vendored
162
3rdparty/include/ffmpeg_/libavcodec/vda.h
vendored
@ -1,162 +0,0 @@
|
||||
/*
|
||||
* VDA HW acceleration
|
||||
*
|
||||
* copyright (c) 2011 Sebastien Zwickert
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_VDA_H
|
||||
#define AVCODEC_VDA_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_codec_hwaccel_vda
|
||||
* Public libavcodec VDA header.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// emmintrin.h is unable to compile with -std=c99 -Werror=missing-prototypes
|
||||
// http://openradar.appspot.com/8026390
|
||||
#undef __GNUC_STDC_INLINE__
|
||||
|
||||
#define Picture QuickdrawPicture
|
||||
#include <VideoDecodeAcceleration/VDADecoder.h>
|
||||
#undef Picture
|
||||
|
||||
#include "libavcodec/version.h"
|
||||
|
||||
/**
|
||||
* @defgroup lavc_codec_hwaccel_vda VDA
|
||||
* @ingroup lavc_codec_hwaccel
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* This structure is used to provide the necessary configurations and data
|
||||
* to the VDA FFmpeg HWAccel implementation.
|
||||
*
|
||||
* The application must make it available as AVCodecContext.hwaccel_context.
|
||||
*/
|
||||
struct vda_context {
|
||||
/**
|
||||
* VDA decoder object.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by libavcodec.
|
||||
*/
|
||||
VDADecoder decoder;
|
||||
|
||||
/**
|
||||
* The Core Video pixel buffer that contains the current image data.
|
||||
*
|
||||
* encoding: unused
|
||||
* decoding: Set by libavcodec. Unset by user.
|
||||
*/
|
||||
CVPixelBufferRef cv_buffer;
|
||||
|
||||
/**
|
||||
* Use the hardware decoder in synchronous mode.
|
||||
*
|
||||
* encoding: unused
|
||||
* decoding: Set by user.
|
||||
*/
|
||||
int use_sync_decoding;
|
||||
|
||||
/**
|
||||
* The frame width.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by user.
|
||||
*/
|
||||
int width;
|
||||
|
||||
/**
|
||||
* The frame height.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by user.
|
||||
*/
|
||||
int height;
|
||||
|
||||
/**
|
||||
* The frame format.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by user.
|
||||
*/
|
||||
int format;
|
||||
|
||||
/**
|
||||
* The pixel format for output image buffers.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by user.
|
||||
*/
|
||||
OSType cv_pix_fmt_type;
|
||||
|
||||
/**
|
||||
* The current bitstream buffer.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by libavcodec.
|
||||
*/
|
||||
uint8_t *priv_bitstream;
|
||||
|
||||
/**
|
||||
* The current size of the bitstream.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by libavcodec.
|
||||
*/
|
||||
int priv_bitstream_size;
|
||||
|
||||
/**
|
||||
* The reference size used for fast reallocation.
|
||||
*
|
||||
* - encoding: unused
|
||||
* - decoding: Set/Unset by libavcodec.
|
||||
*/
|
||||
int priv_allocated_size;
|
||||
|
||||
/**
|
||||
* Use av_buffer to manage buffer.
|
||||
* When the flag is set, the CVPixelBuffers returned by the decoder will
|
||||
* be released automatically, so you have to retain them if necessary.
|
||||
* Not setting this flag may cause memory leak.
|
||||
*
|
||||
* encoding: unused
|
||||
* decoding: Set by user.
|
||||
*/
|
||||
int use_ref_buffer;
|
||||
};
|
||||
|
||||
/** Create the video decoder. */
|
||||
int ff_vda_create_decoder(struct vda_context *vda_ctx,
|
||||
uint8_t *extradata,
|
||||
int extradata_size);
|
||||
|
||||
/** Destroy the video decoder. */
|
||||
int ff_vda_destroy_decoder(struct vda_context *vda_ctx);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVCODEC_VDA_H */
|
159
3rdparty/include/ffmpeg_/libavcodec/vdpau.h
vendored
159
3rdparty/include/ffmpeg_/libavcodec/vdpau.h
vendored
@ -1,159 +0,0 @@
|
||||
/*
|
||||
* The Video Decode and Presentation API for UNIX (VDPAU) is used for
|
||||
* hardware-accelerated decoding of MPEG-1/2, H.264 and VC-1.
|
||||
*
|
||||
* Copyright (C) 2008 NVIDIA
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_VDPAU_H
|
||||
#define AVCODEC_VDPAU_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_codec_hwaccel_vdpau
|
||||
* Public libavcodec VDPAU header.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @defgroup lavc_codec_hwaccel_vdpau VDPAU Decoder and Renderer
|
||||
* @ingroup lavc_codec_hwaccel
|
||||
*
|
||||
* VDPAU hardware acceleration has two modules
|
||||
* - VDPAU decoding
|
||||
* - VDPAU presentation
|
||||
*
|
||||
* The VDPAU decoding module parses all headers using FFmpeg
|
||||
* parsing mechanisms and uses VDPAU for the actual decoding.
|
||||
*
|
||||
* As per the current implementation, the actual decoding
|
||||
* and rendering (API calls) are done as part of the VDPAU
|
||||
* presentation (vo_vdpau.c) module.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
#include <vdpau/vdpau.h>
|
||||
#include <vdpau/vdpau_x11.h>
|
||||
#include "libavutil/avconfig.h"
|
||||
|
||||
union FFVdpPictureInfo {
|
||||
VdpPictureInfoH264 h264;
|
||||
VdpPictureInfoMPEG1Or2 mpeg;
|
||||
VdpPictureInfoVC1 vc1;
|
||||
VdpPictureInfoMPEG4Part2 mpeg4;
|
||||
};
|
||||
|
||||
/**
|
||||
* This structure is used to share data between the libavcodec library and
|
||||
* the client video application.
|
||||
* The user shall zero-allocate the structure and make it available as
|
||||
* AVCodecContext.hwaccel_context. Members can be set by the user once
|
||||
* during initialization or through each AVCodecContext.get_buffer()
|
||||
* function call. In any case, they must be valid prior to calling
|
||||
* decoding functions.
|
||||
*/
|
||||
typedef struct AVVDPAUContext {
|
||||
/**
|
||||
* VDPAU decoder handle
|
||||
*
|
||||
* Set by user.
|
||||
*/
|
||||
VdpDecoder decoder;
|
||||
|
||||
/**
|
||||
* VDPAU decoder render callback
|
||||
*
|
||||
* Set by the user.
|
||||
*/
|
||||
VdpDecoderRender *render;
|
||||
|
||||
/**
|
||||
* VDPAU picture information
|
||||
*
|
||||
* Set by libavcodec.
|
||||
*/
|
||||
union FFVdpPictureInfo info;
|
||||
|
||||
/**
|
||||
* Allocated size of the bitstream_buffers table.
|
||||
*
|
||||
* Set by libavcodec.
|
||||
*/
|
||||
int bitstream_buffers_allocated;
|
||||
|
||||
/**
|
||||
* Useful bitstream buffers in the bitstream buffers table.
|
||||
*
|
||||
* Set by libavcodec.
|
||||
*/
|
||||
int bitstream_buffers_used;
|
||||
|
||||
/**
|
||||
* Table of bitstream buffers.
|
||||
* The user is responsible for freeing this buffer using av_freep().
|
||||
*
|
||||
* Set by libavcodec.
|
||||
*/
|
||||
VdpBitstreamBuffer *bitstream_buffers;
|
||||
} AVVDPAUContext;
|
||||
|
||||
|
||||
/** @brief The videoSurface is used for rendering. */
|
||||
#define FF_VDPAU_STATE_USED_FOR_RENDER 1
|
||||
|
||||
/**
|
||||
* @brief The videoSurface is needed for reference/prediction.
|
||||
* The codec manipulates this.
|
||||
*/
|
||||
#define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
|
||||
|
||||
/**
|
||||
* @brief This structure is used as a callback between the FFmpeg
|
||||
* decoder (vd_) and presentation (vo_) module.
|
||||
* This is used for defining a video frame containing surface,
|
||||
* picture parameter, bitstream information etc which are passed
|
||||
* between the FFmpeg decoder and its clients.
|
||||
*/
|
||||
struct vdpau_render_state {
|
||||
VdpVideoSurface surface; ///< Used as rendered surface, never changed.
|
||||
|
||||
int state; ///< Holds FF_VDPAU_STATE_* values.
|
||||
|
||||
#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
|
||||
/** picture parameter information for all supported codecs */
|
||||
union FFVdpPictureInfo info;
|
||||
#endif
|
||||
|
||||
/** Describe size/location of the compressed video data.
|
||||
Set to 0 when freeing bitstream_buffers. */
|
||||
int bitstream_buffers_allocated;
|
||||
int bitstream_buffers_used;
|
||||
/** The user is responsible for freeing this buffer using av_freep(). */
|
||||
VdpBitstreamBuffer *bitstream_buffers;
|
||||
|
||||
#if !AV_HAVE_INCOMPATIBLE_LIBAV_ABI
|
||||
/** picture parameter information for all supported codecs */
|
||||
union FFVdpPictureInfo info;
|
||||
#endif
|
||||
};
|
||||
|
||||
/* @}*/
|
||||
|
||||
#endif /* AVCODEC_VDPAU_H */
|
95
3rdparty/include/ffmpeg_/libavcodec/version.h
vendored
95
3rdparty/include/ffmpeg_/libavcodec/version.h
vendored
@ -1,95 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_VERSION_H
|
||||
#define AVCODEC_VERSION_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup libavc
|
||||
* Libavcodec version macros.
|
||||
*/
|
||||
|
||||
#include "libavutil/avutil.h"
|
||||
|
||||
#define LIBAVCODEC_VERSION_MAJOR 55
|
||||
#define LIBAVCODEC_VERSION_MINOR 18
|
||||
#define LIBAVCODEC_VERSION_MICRO 102
|
||||
|
||||
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
|
||||
LIBAVCODEC_VERSION_MINOR, \
|
||||
LIBAVCODEC_VERSION_MICRO)
|
||||
#define LIBAVCODEC_VERSION AV_VERSION(LIBAVCODEC_VERSION_MAJOR, \
|
||||
LIBAVCODEC_VERSION_MINOR, \
|
||||
LIBAVCODEC_VERSION_MICRO)
|
||||
#define LIBAVCODEC_BUILD LIBAVCODEC_VERSION_INT
|
||||
|
||||
#define LIBAVCODEC_IDENT "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
|
||||
|
||||
/**
|
||||
* FF_API_* defines may be placed below to indicate public API that will be
|
||||
* dropped at a future version bump. The defines themselves are not part of
|
||||
* the public API and may change, break or disappear at any time.
|
||||
*/
|
||||
|
||||
#ifndef FF_API_REQUEST_CHANNELS
|
||||
#define FF_API_REQUEST_CHANNELS (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_ALLOC_CONTEXT
|
||||
#define FF_API_ALLOC_CONTEXT (LIBAVCODEC_VERSION_MAJOR < 55)
|
||||
#endif
|
||||
#ifndef FF_API_AVCODEC_OPEN
|
||||
#define FF_API_AVCODEC_OPEN (LIBAVCODEC_VERSION_MAJOR < 55)
|
||||
#endif
|
||||
#ifndef FF_API_OLD_DECODE_AUDIO
|
||||
#define FF_API_OLD_DECODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_OLD_TIMECODE
|
||||
#define FF_API_OLD_TIMECODE (LIBAVCODEC_VERSION_MAJOR < 55)
|
||||
#endif
|
||||
|
||||
#ifndef FF_API_OLD_ENCODE_AUDIO
|
||||
#define FF_API_OLD_ENCODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_OLD_ENCODE_VIDEO
|
||||
#define FF_API_OLD_ENCODE_VIDEO (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_CODEC_ID
|
||||
#define FF_API_CODEC_ID (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_AVCODEC_RESAMPLE
|
||||
#define FF_API_AVCODEC_RESAMPLE (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_DEINTERLACE
|
||||
#define FF_API_DEINTERLACE (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_DESTRUCT_PACKET
|
||||
#define FF_API_DESTRUCT_PACKET (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_GET_BUFFER
|
||||
#define FF_API_GET_BUFFER (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_MISSING_SAMPLE
|
||||
#define FF_API_MISSING_SAMPLE (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_LOWRES
|
||||
#define FF_API_LOWRES (LIBAVCODEC_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
|
||||
#endif /* AVCODEC_VERSION_H */
|
168
3rdparty/include/ffmpeg_/libavcodec/xvmc.h
vendored
168
3rdparty/include/ffmpeg_/libavcodec/xvmc.h
vendored
@ -1,168 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Ivan Kalvachev
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_XVMC_H
|
||||
#define AVCODEC_XVMC_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavc_codec_hwaccel_xvmc
|
||||
* Public libavcodec XvMC header.
|
||||
*/
|
||||
|
||||
#include <X11/extensions/XvMC.h>
|
||||
|
||||
#include "avcodec.h"
|
||||
|
||||
/**
|
||||
* @defgroup lavc_codec_hwaccel_xvmc XvMC
|
||||
* @ingroup lavc_codec_hwaccel
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
#define AV_XVMC_ID 0x1DC711C0 /**< special value to ensure that regular pixel routines haven't corrupted the struct
|
||||
the number is 1337 speak for the letters IDCT MCo (motion compensation) */
|
||||
|
||||
struct xvmc_pix_fmt {
|
||||
/** The field contains the special constant value AV_XVMC_ID.
|
||||
It is used as a test that the application correctly uses the API,
|
||||
and that there is no corruption caused by pixel routines.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int xvmc_id;
|
||||
|
||||
/** Pointer to the block array allocated by XvMCCreateBlocks().
|
||||
The array has to be freed by XvMCDestroyBlocks().
|
||||
Each group of 64 values represents one data block of differential
|
||||
pixel information (in MoCo mode) or coefficients for IDCT.
|
||||
- application - set the pointer during initialization
|
||||
- libavcodec - fills coefficients/pixel data into the array
|
||||
*/
|
||||
short* data_blocks;
|
||||
|
||||
/** Pointer to the macroblock description array allocated by
|
||||
XvMCCreateMacroBlocks() and freed by XvMCDestroyMacroBlocks().
|
||||
- application - set the pointer during initialization
|
||||
- libavcodec - fills description data into the array
|
||||
*/
|
||||
XvMCMacroBlock* mv_blocks;
|
||||
|
||||
/** Number of macroblock descriptions that can be stored in the mv_blocks
|
||||
array.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int allocated_mv_blocks;
|
||||
|
||||
/** Number of blocks that can be stored at once in the data_blocks array.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int allocated_data_blocks;
|
||||
|
||||
/** Indicate that the hardware would interpret data_blocks as IDCT
|
||||
coefficients and perform IDCT on them.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int idct;
|
||||
|
||||
/** In MoCo mode it indicates that intra macroblocks are assumed to be in
|
||||
unsigned format; same as the XVMC_INTRA_UNSIGNED flag.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int unsigned_intra;
|
||||
|
||||
/** Pointer to the surface allocated by XvMCCreateSurface().
|
||||
It has to be freed by XvMCDestroySurface() on application exit.
|
||||
It identifies the frame and its state on the video hardware.
|
||||
- application - set during initialization
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
XvMCSurface* p_surface;
|
||||
|
||||
/** Set by the decoder before calling ff_draw_horiz_band(),
|
||||
needed by the XvMCRenderSurface function. */
|
||||
//@{
|
||||
/** Pointer to the surface used as past reference
|
||||
- application - unchanged
|
||||
- libavcodec - set
|
||||
*/
|
||||
XvMCSurface* p_past_surface;
|
||||
|
||||
/** Pointer to the surface used as future reference
|
||||
- application - unchanged
|
||||
- libavcodec - set
|
||||
*/
|
||||
XvMCSurface* p_future_surface;
|
||||
|
||||
/** top/bottom field or frame
|
||||
- application - unchanged
|
||||
- libavcodec - set
|
||||
*/
|
||||
unsigned int picture_structure;
|
||||
|
||||
/** XVMC_SECOND_FIELD - 1st or 2nd field in the sequence
|
||||
- application - unchanged
|
||||
- libavcodec - set
|
||||
*/
|
||||
unsigned int flags;
|
||||
//}@
|
||||
|
||||
/** Number of macroblock descriptions in the mv_blocks array
|
||||
that have already been passed to the hardware.
|
||||
- application - zeroes it on get_buffer().
|
||||
A successful ff_draw_horiz_band() may increment it
|
||||
with filled_mb_block_num or zero both.
|
||||
- libavcodec - unchanged
|
||||
*/
|
||||
int start_mv_blocks_num;
|
||||
|
||||
/** Number of new macroblock descriptions in the mv_blocks array (after
|
||||
start_mv_blocks_num) that are filled by libavcodec and have to be
|
||||
passed to the hardware.
|
||||
- application - zeroes it on get_buffer() or after successful
|
||||
ff_draw_horiz_band().
|
||||
- libavcodec - increment with one of each stored MB
|
||||
*/
|
||||
int filled_mv_blocks_num;
|
||||
|
||||
/** Number of the next free data block; one data block consists of
|
||||
64 short values in the data_blocks array.
|
||||
All blocks before this one have already been claimed by placing their
|
||||
position into the corresponding block description structure field,
|
||||
that are part of the mv_blocks array.
|
||||
- application - zeroes it on get_buffer().
|
||||
A successful ff_draw_horiz_band() may zero it together
|
||||
with start_mb_blocks_num.
|
||||
- libavcodec - each decoded macroblock increases it by the number
|
||||
of coded blocks it contains.
|
||||
*/
|
||||
int next_free_data_block_num;
|
||||
};
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVCODEC_XVMC_H */
|
69
3rdparty/include/ffmpeg_/libavdevice/avdevice.h
vendored
69
3rdparty/include/ffmpeg_/libavdevice/avdevice.h
vendored
@ -1,69 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVDEVICE_AVDEVICE_H
|
||||
#define AVDEVICE_AVDEVICE_H
|
||||
|
||||
#include "version.h"
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavd
|
||||
* Main libavdevice API header
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup lavd Special devices muxing/demuxing library
|
||||
* @{
|
||||
* Libavdevice is a complementary library to @ref libavf "libavformat". It
|
||||
* provides various "special" platform-specific muxers and demuxers, e.g. for
|
||||
* grabbing devices, audio capture and playback etc. As a consequence, the
|
||||
* (de)muxers in libavdevice are of the AVFMT_NOFILE type (they use their own
|
||||
* I/O functions). The filename passed to avformat_open_input() often does not
|
||||
* refer to an actually existing file, but has some special device-specific
|
||||
* meaning - e.g. for x11grab it is the display name.
|
||||
*
|
||||
* To use libavdevice, simply call avdevice_register_all() to register all
|
||||
* compiled muxers and demuxers. They all use standard libavformat API.
|
||||
* @}
|
||||
*/
|
||||
|
||||
#include "libavformat/avformat.h"
|
||||
|
||||
/**
|
||||
* Return the LIBAVDEVICE_VERSION_INT constant.
|
||||
*/
|
||||
unsigned avdevice_version(void);
|
||||
|
||||
/**
|
||||
* Return the libavdevice build-time configuration.
|
||||
*/
|
||||
const char *avdevice_configuration(void);
|
||||
|
||||
/**
|
||||
* Return the libavdevice license.
|
||||
*/
|
||||
const char *avdevice_license(void);
|
||||
|
||||
/**
|
||||
* Initialize libavdevice and register all the input and output devices.
|
||||
* @warning This function is not thread safe.
|
||||
*/
|
||||
void avdevice_register_all(void);
|
||||
|
||||
#endif /* AVDEVICE_AVDEVICE_H */
|
50
3rdparty/include/ffmpeg_/libavdevice/version.h
vendored
50
3rdparty/include/ffmpeg_/libavdevice/version.h
vendored
@ -1,50 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVDEVICE_VERSION_H
|
||||
#define AVDEVICE_VERSION_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavd
|
||||
* Libavdevice version macros
|
||||
*/
|
||||
|
||||
#include "libavutil/avutil.h"
|
||||
|
||||
#define LIBAVDEVICE_VERSION_MAJOR 55
|
||||
#define LIBAVDEVICE_VERSION_MINOR 3
|
||||
#define LIBAVDEVICE_VERSION_MICRO 100
|
||||
|
||||
#define LIBAVDEVICE_VERSION_INT AV_VERSION_INT(LIBAVDEVICE_VERSION_MAJOR, \
|
||||
LIBAVDEVICE_VERSION_MINOR, \
|
||||
LIBAVDEVICE_VERSION_MICRO)
|
||||
#define LIBAVDEVICE_VERSION AV_VERSION(LIBAVDEVICE_VERSION_MAJOR, \
|
||||
LIBAVDEVICE_VERSION_MINOR, \
|
||||
LIBAVDEVICE_VERSION_MICRO)
|
||||
#define LIBAVDEVICE_BUILD LIBAVDEVICE_VERSION_INT
|
||||
|
||||
#define LIBAVDEVICE_IDENT "Lavd" AV_STRINGIFY(LIBAVDEVICE_VERSION)
|
||||
|
||||
/**
|
||||
* FF_API_* defines may be placed below to indicate public API that will be
|
||||
* dropped at a future version bump. The defines themselves are not part of
|
||||
* the public API and may change, break or disappear at any time.
|
||||
*/
|
||||
|
||||
#endif /* AVDEVICE_VERSION_H */
|
2181
3rdparty/include/ffmpeg_/libavformat/avformat.h
vendored
2181
3rdparty/include/ffmpeg_/libavformat/avformat.h
vendored
File diff suppressed because it is too large
Load Diff
481
3rdparty/include/ffmpeg_/libavformat/avio.h
vendored
481
3rdparty/include/ffmpeg_/libavformat/avio.h
vendored
@ -1,481 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2001 Fabrice Bellard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#ifndef AVFORMAT_AVIO_H
|
||||
#define AVFORMAT_AVIO_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup lavf_io
|
||||
* Buffered I/O operations
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/dict.h"
|
||||
#include "libavutil/log.h"
|
||||
|
||||
#include "libavformat/version.h"
|
||||
|
||||
|
||||
#define AVIO_SEEKABLE_NORMAL 0x0001 /**< Seeking works like for a local file */
|
||||
|
||||
/**
|
||||
* Callback for checking whether to abort blocking functions.
|
||||
* AVERROR_EXIT is returned in this case by the interrupted
|
||||
* function. During blocking operations, callback is called with
|
||||
* opaque as parameter. If the callback returns 1, the
|
||||
* blocking operation will be aborted.
|
||||
*
|
||||
* No members can be added to this struct without a major bump, if
|
||||
* new elements have been added after this struct in AVFormatContext
|
||||
* or AVIOContext.
|
||||
*/
|
||||
typedef struct AVIOInterruptCB {
|
||||
int (*callback)(void*);
|
||||
void *opaque;
|
||||
} AVIOInterruptCB;
|
||||
|
||||
/**
|
||||
* Bytestream IO Context.
|
||||
* New fields can be added to the end with minor version bumps.
|
||||
* Removal, reordering and changes to existing fields require a major
|
||||
* version bump.
|
||||
* sizeof(AVIOContext) must not be used outside libav*.
|
||||
*
|
||||
* @note None of the function pointers in AVIOContext should be called
|
||||
* directly, they should only be set by the client application
|
||||
* when implementing custom I/O. Normally these are set to the
|
||||
* function pointers specified in avio_alloc_context()
|
||||
*/
|
||||
typedef struct AVIOContext {
|
||||
/**
|
||||
* A class for private options.
|
||||
*
|
||||
* If this AVIOContext is created by avio_open2(), av_class is set and
|
||||
* passes the options down to protocols.
|
||||
*
|
||||
* If this AVIOContext is manually allocated, then av_class may be set by
|
||||
* the caller.
|
||||
*
|
||||
* warning -- this field can be NULL, be sure to not pass this AVIOContext
|
||||
* to any av_opt_* functions in that case.
|
||||
*/
|
||||
const AVClass *av_class;
|
||||
unsigned char *buffer; /**< Start of the buffer. */
|
||||
int buffer_size; /**< Maximum buffer size */
|
||||
unsigned char *buf_ptr; /**< Current position in the buffer */
|
||||
unsigned char *buf_end; /**< End of the data, may be less than
|
||||
buffer+buffer_size if the read function returned
|
||||
less data than requested, e.g. for streams where
|
||||
no more data has been received yet. */
|
||||
void *opaque; /**< A private pointer, passed to the read/write/seek/...
|
||||
functions. */
|
||||
int (*read_packet)(void *opaque, uint8_t *buf, int buf_size);
|
||||
int (*write_packet)(void *opaque, uint8_t *buf, int buf_size);
|
||||
int64_t (*seek)(void *opaque, int64_t offset, int whence);
|
||||
int64_t pos; /**< position in the file of the current buffer */
|
||||
int must_flush; /**< true if the next seek should flush */
|
||||
int eof_reached; /**< true if eof reached */
|
||||
int write_flag; /**< true if open for writing */
|
||||
int max_packet_size;
|
||||
unsigned long checksum;
|
||||
unsigned char *checksum_ptr;
|
||||
unsigned long (*update_checksum)(unsigned long checksum, const uint8_t *buf, unsigned int size);
|
||||
int error; /**< contains the error code or 0 if no error happened */
|
||||
/**
|
||||
* Pause or resume playback for network streaming protocols - e.g. MMS.
|
||||
*/
|
||||
int (*read_pause)(void *opaque, int pause);
|
||||
/**
|
||||
* Seek to a given timestamp in stream with the specified stream_index.
|
||||
* Needed for some network streaming protocols which don't support seeking
|
||||
* to byte position.
|
||||
*/
|
||||
int64_t (*read_seek)(void *opaque, int stream_index,
|
||||
int64_t timestamp, int flags);
|
||||
/**
|
||||
* A combination of AVIO_SEEKABLE_ flags or 0 when the stream is not seekable.
|
||||
*/
|
||||
int seekable;
|
||||
|
||||
/**
|
||||
* max filesize, used to limit allocations
|
||||
* This field is internal to libavformat and access from outside is not allowed.
|
||||
*/
|
||||
int64_t maxsize;
|
||||
|
||||
/**
|
||||
* avio_read and avio_write should if possible be satisfied directly
|
||||
* instead of going through a buffer, and avio_seek will always
|
||||
* call the underlying seek function directly.
|
||||
*/
|
||||
int direct;
|
||||
|
||||
/**
|
||||
* Bytes read statistic
|
||||
* This field is internal to libavformat and access from outside is not allowed.
|
||||
*/
|
||||
int64_t bytes_read;
|
||||
|
||||
/**
|
||||
* seek statistic
|
||||
* This field is internal to libavformat and access from outside is not allowed.
|
||||
*/
|
||||
int seek_count;
|
||||
|
||||
/**
|
||||
* writeout statistic
|
||||
* This field is internal to libavformat and access from outside is not allowed.
|
||||
*/
|
||||
int writeout_count;
|
||||
} AVIOContext;
|
||||
|
||||
/* unbuffered I/O */
|
||||
|
||||
/**
|
||||
* Return AVIO_FLAG_* access flags corresponding to the access permissions
|
||||
* of the resource in url, or a negative value corresponding to an
|
||||
* AVERROR code in case of failure. The returned access flags are
|
||||
* masked by the value in flags.
|
||||
*
|
||||
* @note This function is intrinsically unsafe, in the sense that the
|
||||
* checked resource may change its existence or permission status from
|
||||
* one call to another. Thus you should not trust the returned value,
|
||||
* unless you are sure that no other processes are accessing the
|
||||
* checked resource.
|
||||
*/
|
||||
int avio_check(const char *url, int flags);
|
||||
|
||||
/**
|
||||
* Allocate and initialize an AVIOContext for buffered I/O. It must be later
|
||||
* freed with av_free().
|
||||
*
|
||||
* @param buffer Memory block for input/output operations via AVIOContext.
|
||||
* The buffer must be allocated with av_malloc() and friends.
|
||||
* @param buffer_size The buffer size is very important for performance.
|
||||
* For protocols with fixed blocksize it should be set to this blocksize.
|
||||
* For others a typical size is a cache page, e.g. 4kb.
|
||||
* @param write_flag Set to 1 if the buffer should be writable, 0 otherwise.
|
||||
* @param opaque An opaque pointer to user-specific data.
|
||||
* @param read_packet A function for refilling the buffer, may be NULL.
|
||||
* @param write_packet A function for writing the buffer contents, may be NULL.
|
||||
* The function may not change the input buffers content.
|
||||
* @param seek A function for seeking to specified byte position, may be NULL.
|
||||
*
|
||||
* @return Allocated AVIOContext or NULL on failure.
|
||||
*/
|
||||
AVIOContext *avio_alloc_context(
|
||||
unsigned char *buffer,
|
||||
int buffer_size,
|
||||
int write_flag,
|
||||
void *opaque,
|
||||
int (*read_packet)(void *opaque, uint8_t *buf, int buf_size),
|
||||
int (*write_packet)(void *opaque, uint8_t *buf, int buf_size),
|
||||
int64_t (*seek)(void *opaque, int64_t offset, int whence));
|
||||
|
||||
void avio_w8(AVIOContext *s, int b);
|
||||
void avio_write(AVIOContext *s, const unsigned char *buf, int size);
|
||||
void avio_wl64(AVIOContext *s, uint64_t val);
|
||||
void avio_wb64(AVIOContext *s, uint64_t val);
|
||||
void avio_wl32(AVIOContext *s, unsigned int val);
|
||||
void avio_wb32(AVIOContext *s, unsigned int val);
|
||||
void avio_wl24(AVIOContext *s, unsigned int val);
|
||||
void avio_wb24(AVIOContext *s, unsigned int val);
|
||||
void avio_wl16(AVIOContext *s, unsigned int val);
|
||||
void avio_wb16(AVIOContext *s, unsigned int val);
|
||||
|
||||
/**
|
||||
* Write a NULL-terminated string.
|
||||
* @return number of bytes written.
|
||||
*/
|
||||
int avio_put_str(AVIOContext *s, const char *str);
|
||||
|
||||
/**
|
||||
* Convert an UTF-8 string to UTF-16LE and write it.
|
||||
* @return number of bytes written.
|
||||
*/
|
||||
int avio_put_str16le(AVIOContext *s, const char *str);
|
||||
|
||||
/**
|
||||
* Passing this as the "whence" parameter to a seek function causes it to
|
||||
* return the filesize without seeking anywhere. Supporting this is optional.
|
||||
* If it is not supported then the seek function will return <0.
|
||||
*/
|
||||
#define AVSEEK_SIZE 0x10000
|
||||
|
||||
/**
|
||||
* Oring this flag as into the "whence" parameter to a seek function causes it to
|
||||
* seek by any means (like reopening and linear reading) or other normally unreasonable
|
||||
* means that can be extremely slow.
|
||||
* This may be ignored by the seek code.
|
||||
*/
|
||||
#define AVSEEK_FORCE 0x20000
|
||||
|
||||
/**
|
||||
* fseek() equivalent for AVIOContext.
|
||||
* @return new position or AVERROR.
|
||||
*/
|
||||
int64_t avio_seek(AVIOContext *s, int64_t offset, int whence);
|
||||
|
||||
/**
|
||||
* Skip given number of bytes forward
|
||||
* @return new position or AVERROR.
|
||||
*/
|
||||
int64_t avio_skip(AVIOContext *s, int64_t offset);
|
||||
|
||||
/**
|
||||
* ftell() equivalent for AVIOContext.
|
||||
* @return position or AVERROR.
|
||||
*/
|
||||
static av_always_inline int64_t avio_tell(AVIOContext *s)
|
||||
{
|
||||
return avio_seek(s, 0, SEEK_CUR);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the filesize.
|
||||
* @return filesize or AVERROR
|
||||
*/
|
||||
int64_t avio_size(AVIOContext *s);
|
||||
|
||||
/**
|
||||
* feof() equivalent for AVIOContext.
|
||||
* @return non zero if and only if end of file
|
||||
*/
|
||||
int url_feof(AVIOContext *s);
|
||||
|
||||
/** @warning currently size is limited */
|
||||
int avio_printf(AVIOContext *s, const char *fmt, ...) av_printf_format(2, 3);
|
||||
|
||||
/**
|
||||
* Force flushing of buffered data to the output s.
|
||||
*
|
||||
* Force the buffered data to be immediately written to the output,
|
||||
* without to wait to fill the internal buffer.
|
||||
*/
|
||||
void avio_flush(AVIOContext *s);
|
||||
|
||||
/**
|
||||
* Read size bytes from AVIOContext into buf.
|
||||
* @return number of bytes read or AVERROR
|
||||
*/
|
||||
int avio_read(AVIOContext *s, unsigned char *buf, int size);
|
||||
|
||||
/**
|
||||
* @name Functions for reading from AVIOContext
|
||||
* @{
|
||||
*
|
||||
* @note return 0 if EOF, so you cannot use it if EOF handling is
|
||||
* necessary
|
||||
*/
|
||||
int avio_r8 (AVIOContext *s);
|
||||
unsigned int avio_rl16(AVIOContext *s);
|
||||
unsigned int avio_rl24(AVIOContext *s);
|
||||
unsigned int avio_rl32(AVIOContext *s);
|
||||
uint64_t avio_rl64(AVIOContext *s);
|
||||
unsigned int avio_rb16(AVIOContext *s);
|
||||
unsigned int avio_rb24(AVIOContext *s);
|
||||
unsigned int avio_rb32(AVIOContext *s);
|
||||
uint64_t avio_rb64(AVIOContext *s);
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Read a string from pb into buf. The reading will terminate when either
|
||||
* a NULL character was encountered, maxlen bytes have been read, or nothing
|
||||
* more can be read from pb. The result is guaranteed to be NULL-terminated, it
|
||||
* will be truncated if buf is too small.
|
||||
* Note that the string is not interpreted or validated in any way, it
|
||||
* might get truncated in the middle of a sequence for multi-byte encodings.
|
||||
*
|
||||
* @return number of bytes read (is always <= maxlen).
|
||||
* If reading ends on EOF or error, the return value will be one more than
|
||||
* bytes actually read.
|
||||
*/
|
||||
int avio_get_str(AVIOContext *pb, int maxlen, char *buf, int buflen);
|
||||
|
||||
/**
|
||||
* Read a UTF-16 string from pb and convert it to UTF-8.
|
||||
* The reading will terminate when either a null or invalid character was
|
||||
* encountered or maxlen bytes have been read.
|
||||
* @return number of bytes read (is always <= maxlen)
|
||||
*/
|
||||
int avio_get_str16le(AVIOContext *pb, int maxlen, char *buf, int buflen);
|
||||
int avio_get_str16be(AVIOContext *pb, int maxlen, char *buf, int buflen);
|
||||
|
||||
|
||||
/**
|
||||
* @name URL open modes
|
||||
* The flags argument to avio_open must be one of the following
|
||||
* constants, optionally ORed with other flags.
|
||||
* @{
|
||||
*/
|
||||
#define AVIO_FLAG_READ 1 /**< read-only */
|
||||
#define AVIO_FLAG_WRITE 2 /**< write-only */
|
||||
#define AVIO_FLAG_READ_WRITE (AVIO_FLAG_READ|AVIO_FLAG_WRITE) /**< read-write pseudo flag */
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Use non-blocking mode.
|
||||
* If this flag is set, operations on the context will return
|
||||
* AVERROR(EAGAIN) if they can not be performed immediately.
|
||||
* If this flag is not set, operations on the context will never return
|
||||
* AVERROR(EAGAIN).
|
||||
* Note that this flag does not affect the opening/connecting of the
|
||||
* context. Connecting a protocol will always block if necessary (e.g. on
|
||||
* network protocols) but never hang (e.g. on busy devices).
|
||||
* Warning: non-blocking protocols is work-in-progress; this flag may be
|
||||
* silently ignored.
|
||||
*/
|
||||
#define AVIO_FLAG_NONBLOCK 8
|
||||
|
||||
/**
|
||||
* Use direct mode.
|
||||
* avio_read and avio_write should if possible be satisfied directly
|
||||
* instead of going through a buffer, and avio_seek will always
|
||||
* call the underlying seek function directly.
|
||||
*/
|
||||
#define AVIO_FLAG_DIRECT 0x8000
|
||||
|
||||
/**
|
||||
* Create and initialize a AVIOContext for accessing the
|
||||
* resource indicated by url.
|
||||
* @note When the resource indicated by url has been opened in
|
||||
* read+write mode, the AVIOContext can be used only for writing.
|
||||
*
|
||||
* @param s Used to return the pointer to the created AVIOContext.
|
||||
* In case of failure the pointed to value is set to NULL.
|
||||
* @param flags flags which control how the resource indicated by url
|
||||
* is to be opened
|
||||
* @return 0 in case of success, a negative value corresponding to an
|
||||
* AVERROR code in case of failure
|
||||
*/
|
||||
int avio_open(AVIOContext **s, const char *url, int flags);
|
||||
|
||||
/**
|
||||
* Create and initialize a AVIOContext for accessing the
|
||||
* resource indicated by url.
|
||||
* @note When the resource indicated by url has been opened in
|
||||
* read+write mode, the AVIOContext can be used only for writing.
|
||||
*
|
||||
* @param s Used to return the pointer to the created AVIOContext.
|
||||
* In case of failure the pointed to value is set to NULL.
|
||||
* @param flags flags which control how the resource indicated by url
|
||||
* is to be opened
|
||||
* @param int_cb an interrupt callback to be used at the protocols level
|
||||
* @param options A dictionary filled with protocol-private options. On return
|
||||
* this parameter will be destroyed and replaced with a dict containing options
|
||||
* that were not found. May be NULL.
|
||||
* @return 0 in case of success, a negative value corresponding to an
|
||||
* AVERROR code in case of failure
|
||||
*/
|
||||
int avio_open2(AVIOContext **s, const char *url, int flags,
|
||||
const AVIOInterruptCB *int_cb, AVDictionary **options);
|
||||
|
||||
/**
|
||||
* Close the resource accessed by the AVIOContext s and free it.
|
||||
* This function can only be used if s was opened by avio_open().
|
||||
*
|
||||
* The internal buffer is automatically flushed before closing the
|
||||
* resource.
|
||||
*
|
||||
* @return 0 on success, an AVERROR < 0 on error.
|
||||
* @see avio_closep
|
||||
*/
|
||||
int avio_close(AVIOContext *s);
|
||||
|
||||
/**
|
||||
* Close the resource accessed by the AVIOContext *s, free it
|
||||
* and set the pointer pointing to it to NULL.
|
||||
* This function can only be used if s was opened by avio_open().
|
||||
*
|
||||
* The internal buffer is automatically flushed before closing the
|
||||
* resource.
|
||||
*
|
||||
* @return 0 on success, an AVERROR < 0 on error.
|
||||
* @see avio_close
|
||||
*/
|
||||
int avio_closep(AVIOContext **s);
|
||||
|
||||
|
||||
/**
|
||||
* Open a write only memory stream.
|
||||
*
|
||||
* @param s new IO context
|
||||
* @return zero if no error.
|
||||
*/
|
||||
int avio_open_dyn_buf(AVIOContext **s);
|
||||
|
||||
/**
|
||||
* Return the written size and a pointer to the buffer. The buffer
|
||||
* must be freed with av_free().
|
||||
* Padding of FF_INPUT_BUFFER_PADDING_SIZE is added to the buffer.
|
||||
*
|
||||
* @param s IO context
|
||||
* @param pbuffer pointer to a byte buffer
|
||||
* @return the length of the byte buffer
|
||||
*/
|
||||
int avio_close_dyn_buf(AVIOContext *s, uint8_t **pbuffer);
|
||||
|
||||
/**
|
||||
* Iterate through names of available protocols.
|
||||
*
|
||||
* @param opaque A private pointer representing current protocol.
|
||||
* It must be a pointer to NULL on first iteration and will
|
||||
* be updated by successive calls to avio_enum_protocols.
|
||||
* @param output If set to 1, iterate over output protocols,
|
||||
* otherwise over input protocols.
|
||||
*
|
||||
* @return A static string containing the name of current protocol or NULL
|
||||
*/
|
||||
const char *avio_enum_protocols(void **opaque, int output);
|
||||
|
||||
/**
|
||||
* Pause and resume playing - only meaningful if using a network streaming
|
||||
* protocol (e.g. MMS).
|
||||
* @param pause 1 for pause, 0 for resume
|
||||
*/
|
||||
int avio_pause(AVIOContext *h, int pause);
|
||||
|
||||
/**
|
||||
* Seek to a given timestamp relative to some component stream.
|
||||
* Only meaningful if using a network streaming protocol (e.g. MMS.).
|
||||
* @param stream_index The stream index that the timestamp is relative to.
|
||||
* If stream_index is (-1) the timestamp should be in AV_TIME_BASE
|
||||
* units from the beginning of the presentation.
|
||||
* If a stream_index >= 0 is used and the protocol does not support
|
||||
* seeking based on component streams, the call will fail.
|
||||
* @param timestamp timestamp in AVStream.time_base units
|
||||
* or if there is no stream specified then in AV_TIME_BASE units.
|
||||
* @param flags Optional combination of AVSEEK_FLAG_BACKWARD, AVSEEK_FLAG_BYTE
|
||||
* and AVSEEK_FLAG_ANY. The protocol may silently ignore
|
||||
* AVSEEK_FLAG_BACKWARD and AVSEEK_FLAG_ANY, but AVSEEK_FLAG_BYTE will
|
||||
* fail if used and not supported.
|
||||
* @return >= 0 on success
|
||||
* @see AVInputFormat::read_seek
|
||||
*/
|
||||
int64_t avio_seek_time(AVIOContext *h, int stream_index,
|
||||
int64_t timestamp, int flags);
|
||||
|
||||
#endif /* AVFORMAT_AVIO_H */
|
82
3rdparty/include/ffmpeg_/libavformat/version.h
vendored
82
3rdparty/include/ffmpeg_/libavformat/version.h
vendored
@ -1,82 +0,0 @@
|
||||
/*
|
||||
* Version macros.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVFORMAT_VERSION_H
|
||||
#define AVFORMAT_VERSION_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @ingroup libavf
|
||||
* Libavformat version macros
|
||||
*/
|
||||
|
||||
#include "libavutil/avutil.h"
|
||||
|
||||
#define LIBAVFORMAT_VERSION_MAJOR 55
|
||||
#define LIBAVFORMAT_VERSION_MINOR 12
|
||||
#define LIBAVFORMAT_VERSION_MICRO 100
|
||||
|
||||
#define LIBAVFORMAT_VERSION_INT AV_VERSION_INT(LIBAVFORMAT_VERSION_MAJOR, \
|
||||
LIBAVFORMAT_VERSION_MINOR, \
|
||||
LIBAVFORMAT_VERSION_MICRO)
|
||||
#define LIBAVFORMAT_VERSION AV_VERSION(LIBAVFORMAT_VERSION_MAJOR, \
|
||||
LIBAVFORMAT_VERSION_MINOR, \
|
||||
LIBAVFORMAT_VERSION_MICRO)
|
||||
#define LIBAVFORMAT_BUILD LIBAVFORMAT_VERSION_INT
|
||||
|
||||
#define LIBAVFORMAT_IDENT "Lavf" AV_STRINGIFY(LIBAVFORMAT_VERSION)
|
||||
|
||||
/**
|
||||
* FF_API_* defines may be placed below to indicate public API that will be
|
||||
* dropped at a future version bump. The defines themselves are not part of
|
||||
* the public API and may change, break or disappear at any time.
|
||||
*/
|
||||
|
||||
#ifndef FF_API_OLD_AVIO
|
||||
#define FF_API_OLD_AVIO (LIBAVFORMAT_VERSION_MAJOR < 55)
|
||||
#endif
|
||||
#ifndef FF_API_PKT_DUMP
|
||||
#define FF_API_PKT_DUMP (LIBAVFORMAT_VERSION_MAJOR < 54)
|
||||
#endif
|
||||
#ifndef FF_API_ALLOC_OUTPUT_CONTEXT
|
||||
#define FF_API_ALLOC_OUTPUT_CONTEXT (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_FORMAT_PARAMETERS
|
||||
#define FF_API_FORMAT_PARAMETERS (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_NEW_STREAM
|
||||
#define FF_API_NEW_STREAM (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_SET_PTS_INFO
|
||||
#define FF_API_SET_PTS_INFO (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_CLOSE_INPUT_FILE
|
||||
#define FF_API_CLOSE_INPUT_FILE (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_READ_PACKET
|
||||
#define FF_API_READ_PACKET (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_ASS_SSA
|
||||
#define FF_API_ASS_SSA (LIBAVFORMAT_VERSION_MAJOR < 56)
|
||||
#endif
|
||||
#ifndef FF_API_R_FRAME_RATE
|
||||
#define FF_API_R_FRAME_RATE 1
|
||||
#endif
|
||||
#endif /* AVFORMAT_VERSION_H */
|
52
3rdparty/include/ffmpeg_/libavutil/adler32.h
vendored
52
3rdparty/include/ffmpeg_/libavutil/adler32.h
vendored
@ -1,52 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2006 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_ADLER32_H
|
||||
#define AVUTIL_ADLER32_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "attributes.h"
|
||||
|
||||
/**
|
||||
* @defgroup lavu_adler32 Adler32
|
||||
* @ingroup lavu_crypto
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Calculate the Adler32 checksum of a buffer.
|
||||
*
|
||||
* Passing the return value to a subsequent av_adler32_update() call
|
||||
* allows the checksum of multiple buffers to be calculated as though
|
||||
* they were concatenated.
|
||||
*
|
||||
* @param adler initial checksum value
|
||||
* @param buf pointer to input buffer
|
||||
* @param len size of input buffer
|
||||
* @return updated checksum
|
||||
*/
|
||||
unsigned long av_adler32_update(unsigned long adler, const uint8_t *buf,
|
||||
unsigned int len) av_pure;
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_ADLER32_H */
|
65
3rdparty/include/ffmpeg_/libavutil/aes.h
vendored
65
3rdparty/include/ffmpeg_/libavutil/aes.h
vendored
@ -1,65 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2007 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_AES_H
|
||||
#define AVUTIL_AES_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "attributes.h"
|
||||
#include "version.h"
|
||||
|
||||
/**
|
||||
* @defgroup lavu_aes AES
|
||||
* @ingroup lavu_crypto
|
||||
* @{
|
||||
*/
|
||||
|
||||
extern const int av_aes_size;
|
||||
|
||||
struct AVAES;
|
||||
|
||||
/**
|
||||
* Allocate an AVAES context.
|
||||
*/
|
||||
struct AVAES *av_aes_alloc(void);
|
||||
|
||||
/**
|
||||
* Initialize an AVAES context.
|
||||
* @param key_bits 128, 192 or 256
|
||||
* @param decrypt 0 for encryption, 1 for decryption
|
||||
*/
|
||||
int av_aes_init(struct AVAES *a, const uint8_t *key, int key_bits, int decrypt);
|
||||
|
||||
/**
|
||||
* Encrypt or decrypt a buffer using a previously initialized context.
|
||||
* @param count number of 16 byte blocks
|
||||
* @param dst destination array, can be equal to src
|
||||
* @param src source array, can be equal to dst
|
||||
* @param iv initialization vector for CBC mode, if NULL then ECB will be used
|
||||
* @param decrypt 0 for encryption, 1 for decryption
|
||||
*/
|
||||
void av_aes_crypt(struct AVAES *a, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_AES_H */
|
154
3rdparty/include/ffmpeg_/libavutil/attributes.h
vendored
154
3rdparty/include/ffmpeg_/libavutil/attributes.h
vendored
@ -1,154 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Macro definitions for various function/variable attributes
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_ATTRIBUTES_H
|
||||
#define AVUTIL_ATTRIBUTES_H
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y)
|
||||
#else
|
||||
# define AV_GCC_VERSION_AT_LEAST(x,y) 0
|
||||
#endif
|
||||
|
||||
#ifndef av_always_inline
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,1)
|
||||
# define av_always_inline __attribute__((always_inline)) inline
|
||||
#elif defined(_MSC_VER)
|
||||
# define av_always_inline __forceinline
|
||||
#else
|
||||
# define av_always_inline inline
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef av_extern_inline
|
||||
#if defined(__ICL) && __ICL >= 1210 || defined(__GNUC_STDC_INLINE__)
|
||||
# define av_extern_inline extern inline
|
||||
#else
|
||||
# define av_extern_inline inline
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,1)
|
||||
# define av_noinline __attribute__((noinline))
|
||||
#else
|
||||
# define av_noinline
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,1)
|
||||
# define av_pure __attribute__((pure))
|
||||
#else
|
||||
# define av_pure
|
||||
#endif
|
||||
|
||||
#ifndef av_restrict
|
||||
#define av_restrict restrict
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(2,6)
|
||||
# define av_const __attribute__((const))
|
||||
#else
|
||||
# define av_const
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(4,3)
|
||||
# define av_cold __attribute__((cold))
|
||||
#else
|
||||
# define av_cold
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(4,1)
|
||||
# define av_flatten __attribute__((flatten))
|
||||
#else
|
||||
# define av_flatten
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,1)
|
||||
# define attribute_deprecated __attribute__((deprecated))
|
||||
#else
|
||||
# define attribute_deprecated
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Disable warnings about deprecated features
|
||||
* This is useful for sections of code kept for backward compatibility and
|
||||
* scheduled for removal.
|
||||
*/
|
||||
#ifndef AV_NOWARN_DEPRECATED
|
||||
#if AV_GCC_VERSION_AT_LEAST(4,6)
|
||||
# define AV_NOWARN_DEPRECATED(code) \
|
||||
_Pragma("GCC diagnostic push") \
|
||||
_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") \
|
||||
code \
|
||||
_Pragma("GCC diagnostic pop")
|
||||
#else
|
||||
# define AV_NOWARN_DEPRECATED(code) code
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__GNUC__)
|
||||
# define av_unused __attribute__((unused))
|
||||
#else
|
||||
# define av_unused
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Mark a variable as used and prevent the compiler from optimizing it
|
||||
* away. This is useful for variables accessed only from inline
|
||||
* assembler without the compiler being aware.
|
||||
*/
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,1)
|
||||
# define av_used __attribute__((used))
|
||||
#else
|
||||
# define av_used
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(3,3)
|
||||
# define av_alias __attribute__((may_alias))
|
||||
#else
|
||||
# define av_alias
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
|
||||
# define av_uninit(x) x=x
|
||||
#else
|
||||
# define av_uninit(x) x
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define av_builtin_constant_p __builtin_constant_p
|
||||
# define av_printf_format(fmtpos, attrpos) __attribute__((__format__(__printf__, fmtpos, attrpos)))
|
||||
#else
|
||||
# define av_builtin_constant_p(x) 0
|
||||
# define av_printf_format(fmtpos, attrpos)
|
||||
#endif
|
||||
|
||||
#if AV_GCC_VERSION_AT_LEAST(2,5)
|
||||
# define av_noreturn __attribute__((noreturn))
|
||||
#else
|
||||
# define av_noreturn
|
||||
#endif
|
||||
|
||||
#endif /* AVUTIL_ATTRIBUTES_H */
|
149
3rdparty/include/ffmpeg_/libavutil/audio_fifo.h
vendored
149
3rdparty/include/ffmpeg_/libavutil/audio_fifo.h
vendored
@ -1,149 +0,0 @@
|
||||
/*
|
||||
* Audio FIFO
|
||||
* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Audio FIFO Buffer
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_AUDIO_FIFO_H
|
||||
#define AVUTIL_AUDIO_FIFO_H
|
||||
|
||||
#include "avutil.h"
|
||||
#include "fifo.h"
|
||||
#include "samplefmt.h"
|
||||
|
||||
/**
|
||||
* @addtogroup lavu_audio
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Context for an Audio FIFO Buffer.
|
||||
*
|
||||
* - Operates at the sample level rather than the byte level.
|
||||
* - Supports multiple channels with either planar or packed sample format.
|
||||
* - Automatic reallocation when writing to a full buffer.
|
||||
*/
|
||||
typedef struct AVAudioFifo AVAudioFifo;
|
||||
|
||||
/**
|
||||
* Free an AVAudioFifo.
|
||||
*
|
||||
* @param af AVAudioFifo to free
|
||||
*/
|
||||
void av_audio_fifo_free(AVAudioFifo *af);
|
||||
|
||||
/**
|
||||
* Allocate an AVAudioFifo.
|
||||
*
|
||||
* @param sample_fmt sample format
|
||||
* @param channels number of channels
|
||||
* @param nb_samples initial allocation size, in samples
|
||||
* @return newly allocated AVAudioFifo, or NULL on error
|
||||
*/
|
||||
AVAudioFifo *av_audio_fifo_alloc(enum AVSampleFormat sample_fmt, int channels,
|
||||
int nb_samples);
|
||||
|
||||
/**
|
||||
* Reallocate an AVAudioFifo.
|
||||
*
|
||||
* @param af AVAudioFifo to reallocate
|
||||
* @param nb_samples new allocation size, in samples
|
||||
* @return 0 if OK, or negative AVERROR code on failure
|
||||
*/
|
||||
int av_audio_fifo_realloc(AVAudioFifo *af, int nb_samples);
|
||||
|
||||
/**
|
||||
* Write data to an AVAudioFifo.
|
||||
*
|
||||
* The AVAudioFifo will be reallocated automatically if the available space
|
||||
* is less than nb_samples.
|
||||
*
|
||||
* @see enum AVSampleFormat
|
||||
* The documentation for AVSampleFormat describes the data layout.
|
||||
*
|
||||
* @param af AVAudioFifo to write to
|
||||
* @param data audio data plane pointers
|
||||
* @param nb_samples number of samples to write
|
||||
* @return number of samples actually written, or negative AVERROR
|
||||
* code on failure. If successful, the number of samples
|
||||
* actually written will always be nb_samples.
|
||||
*/
|
||||
int av_audio_fifo_write(AVAudioFifo *af, void **data, int nb_samples);
|
||||
|
||||
/**
|
||||
* Read data from an AVAudioFifo.
|
||||
*
|
||||
* @see enum AVSampleFormat
|
||||
* The documentation for AVSampleFormat describes the data layout.
|
||||
*
|
||||
* @param af AVAudioFifo to read from
|
||||
* @param data audio data plane pointers
|
||||
* @param nb_samples number of samples to read
|
||||
* @return number of samples actually read, or negative AVERROR code
|
||||
* on failure. The number of samples actually read will not
|
||||
* be greater than nb_samples, and will only be less than
|
||||
* nb_samples if av_audio_fifo_size is less than nb_samples.
|
||||
*/
|
||||
int av_audio_fifo_read(AVAudioFifo *af, void **data, int nb_samples);
|
||||
|
||||
/**
|
||||
* Drain data from an AVAudioFifo.
|
||||
*
|
||||
* Removes the data without reading it.
|
||||
*
|
||||
* @param af AVAudioFifo to drain
|
||||
* @param nb_samples number of samples to drain
|
||||
* @return 0 if OK, or negative AVERROR code on failure
|
||||
*/
|
||||
int av_audio_fifo_drain(AVAudioFifo *af, int nb_samples);
|
||||
|
||||
/**
|
||||
* Reset the AVAudioFifo buffer.
|
||||
*
|
||||
* This empties all data in the buffer.
|
||||
*
|
||||
* @param af AVAudioFifo to reset
|
||||
*/
|
||||
void av_audio_fifo_reset(AVAudioFifo *af);
|
||||
|
||||
/**
|
||||
* Get the current number of samples in the AVAudioFifo available for reading.
|
||||
*
|
||||
* @param af the AVAudioFifo to query
|
||||
* @return number of samples available for reading
|
||||
*/
|
||||
int av_audio_fifo_size(AVAudioFifo *af);
|
||||
|
||||
/**
|
||||
* Get the current number of samples in the AVAudioFifo available for writing.
|
||||
*
|
||||
* @param af the AVAudioFifo to query
|
||||
* @return number of samples available for writing
|
||||
*/
|
||||
int av_audio_fifo_space(AVAudioFifo *af);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_AUDIO_FIFO_H */
|
@ -1,6 +0,0 @@
|
||||
|
||||
#include "version.h"
|
||||
|
||||
#if FF_API_AUDIOCONVERT
|
||||
#include "channel_layout.h"
|
||||
#endif
|
66
3rdparty/include/ffmpeg_/libavutil/avassert.h
vendored
66
3rdparty/include/ffmpeg_/libavutil/avassert.h
vendored
@ -1,66 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2010 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* simple assert() macros that are a bit more flexible than ISO C assert().
|
||||
* @author Michael Niedermayer <michaelni@gmx.at>
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_AVASSERT_H
|
||||
#define AVUTIL_AVASSERT_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "avutil.h"
|
||||
#include "log.h"
|
||||
|
||||
/**
|
||||
* assert() equivalent, that is always enabled.
|
||||
*/
|
||||
#define av_assert0(cond) do { \
|
||||
if (!(cond)) { \
|
||||
av_log(NULL, AV_LOG_PANIC, "Assertion %s failed at %s:%d\n", \
|
||||
AV_STRINGIFY(cond), __FILE__, __LINE__); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
* assert() equivalent, that does not lie in speed critical code.
|
||||
* These asserts() thus can be enabled without fearing speedloss.
|
||||
*/
|
||||
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 0
|
||||
#define av_assert1(cond) av_assert0(cond)
|
||||
#else
|
||||
#define av_assert1(cond) ((void)0)
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* assert() equivalent, that does lie in speed critical code.
|
||||
*/
|
||||
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
|
||||
#define av_assert2(cond) av_assert0(cond)
|
||||
#else
|
||||
#define av_assert2(cond) ((void)0)
|
||||
#endif
|
||||
|
||||
#endif /* AVUTIL_AVASSERT_H */
|
@ -1,8 +0,0 @@
|
||||
/* Generated by ffconf */
|
||||
#ifndef AVUTIL_AVCONFIG_H
|
||||
#define AVUTIL_AVCONFIG_H
|
||||
#define AV_HAVE_BIGENDIAN 0
|
||||
#define AV_HAVE_FAST_UNALIGNED 1
|
||||
#define AV_HAVE_INCOMPATIBLE_LIBAV_ABI 0
|
||||
#define AV_HAVE_INCOMPATIBLE_FORK_ABI 0
|
||||
#endif /* AVUTIL_AVCONFIG_H */
|
302
3rdparty/include/ffmpeg_/libavutil/avstring.h
vendored
302
3rdparty/include/ffmpeg_/libavutil/avstring.h
vendored
@ -1,302 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Mans Rullgard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_AVSTRING_H
|
||||
#define AVUTIL_AVSTRING_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include "attributes.h"
|
||||
|
||||
/**
|
||||
* @addtogroup lavu_string
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Return non-zero if pfx is a prefix of str. If it is, *ptr is set to
|
||||
* the address of the first character in str after the prefix.
|
||||
*
|
||||
* @param str input string
|
||||
* @param pfx prefix to test
|
||||
* @param ptr updated if the prefix is matched inside str
|
||||
* @return non-zero if the prefix matches, zero otherwise
|
||||
*/
|
||||
int av_strstart(const char *str, const char *pfx, const char **ptr);
|
||||
|
||||
/**
|
||||
* Return non-zero if pfx is a prefix of str independent of case. If
|
||||
* it is, *ptr is set to the address of the first character in str
|
||||
* after the prefix.
|
||||
*
|
||||
* @param str input string
|
||||
* @param pfx prefix to test
|
||||
* @param ptr updated if the prefix is matched inside str
|
||||
* @return non-zero if the prefix matches, zero otherwise
|
||||
*/
|
||||
int av_stristart(const char *str, const char *pfx, const char **ptr);
|
||||
|
||||
/**
|
||||
* Locate the first case-independent occurrence in the string haystack
|
||||
* of the string needle. A zero-length string needle is considered to
|
||||
* match at the start of haystack.
|
||||
*
|
||||
* This function is a case-insensitive version of the standard strstr().
|
||||
*
|
||||
* @param haystack string to search in
|
||||
* @param needle string to search for
|
||||
* @return pointer to the located match within haystack
|
||||
* or a null pointer if no match
|
||||
*/
|
||||
char *av_stristr(const char *haystack, const char *needle);
|
||||
|
||||
/**
|
||||
* Locate the first occurrence of the string needle in the string haystack
|
||||
* where not more than hay_length characters are searched. A zero-length
|
||||
* string needle is considered to match at the start of haystack.
|
||||
*
|
||||
* This function is a length-limited version of the standard strstr().
|
||||
*
|
||||
* @param haystack string to search in
|
||||
* @param needle string to search for
|
||||
* @param hay_length length of string to search in
|
||||
* @return pointer to the located match within haystack
|
||||
* or a null pointer if no match
|
||||
*/
|
||||
char *av_strnstr(const char *haystack, const char *needle, size_t hay_length);
|
||||
|
||||
/**
|
||||
* Copy the string src to dst, but no more than size - 1 bytes, and
|
||||
* null-terminate dst.
|
||||
*
|
||||
* This function is the same as BSD strlcpy().
|
||||
*
|
||||
* @param dst destination buffer
|
||||
* @param src source string
|
||||
* @param size size of destination buffer
|
||||
* @return the length of src
|
||||
*
|
||||
* @warning since the return value is the length of src, src absolutely
|
||||
* _must_ be a properly 0-terminated string, otherwise this will read beyond
|
||||
* the end of the buffer and possibly crash.
|
||||
*/
|
||||
size_t av_strlcpy(char *dst, const char *src, size_t size);
|
||||
|
||||
/**
|
||||
* Append the string src to the string dst, but to a total length of
|
||||
* no more than size - 1 bytes, and null-terminate dst.
|
||||
*
|
||||
* This function is similar to BSD strlcat(), but differs when
|
||||
* size <= strlen(dst).
|
||||
*
|
||||
* @param dst destination buffer
|
||||
* @param src source string
|
||||
* @param size size of destination buffer
|
||||
* @return the total length of src and dst
|
||||
*
|
||||
* @warning since the return value use the length of src and dst, these
|
||||
* absolutely _must_ be a properly 0-terminated strings, otherwise this
|
||||
* will read beyond the end of the buffer and possibly crash.
|
||||
*/
|
||||
size_t av_strlcat(char *dst, const char *src, size_t size);
|
||||
|
||||
/**
|
||||
* Append output to a string, according to a format. Never write out of
|
||||
* the destination buffer, and always put a terminating 0 within
|
||||
* the buffer.
|
||||
* @param dst destination buffer (string to which the output is
|
||||
* appended)
|
||||
* @param size total size of the destination buffer
|
||||
* @param fmt printf-compatible format string, specifying how the
|
||||
* following parameters are used
|
||||
* @return the length of the string that would have been generated
|
||||
* if enough space had been available
|
||||
*/
|
||||
size_t av_strlcatf(char *dst, size_t size, const char *fmt, ...) av_printf_format(3, 4);
|
||||
|
||||
/**
|
||||
* Print arguments following specified format into a large enough auto
|
||||
* allocated buffer. It is similar to GNU asprintf().
|
||||
* @param fmt printf-compatible format string, specifying how the
|
||||
* following parameters are used.
|
||||
* @return the allocated string
|
||||
* @note You have to free the string yourself with av_free().
|
||||
*/
|
||||
char *av_asprintf(const char *fmt, ...) av_printf_format(1, 2);
|
||||
|
||||
/**
|
||||
* Convert a number to a av_malloced string.
|
||||
*/
|
||||
char *av_d2str(double d);
|
||||
|
||||
/**
|
||||
* Unescape the given string until a non escaped terminating char,
|
||||
* and return the token corresponding to the unescaped string.
|
||||
*
|
||||
* The normal \ and ' escaping is supported. Leading and trailing
|
||||
* whitespaces are removed, unless they are escaped with '\' or are
|
||||
* enclosed between ''.
|
||||
*
|
||||
* @param buf the buffer to parse, buf will be updated to point to the
|
||||
* terminating char
|
||||
* @param term a 0-terminated list of terminating chars
|
||||
* @return the malloced unescaped string, which must be av_freed by
|
||||
* the user, NULL in case of allocation failure
|
||||
*/
|
||||
char *av_get_token(const char **buf, const char *term);
|
||||
|
||||
/**
|
||||
* Split the string into several tokens which can be accessed by
|
||||
* successive calls to av_strtok().
|
||||
*
|
||||
* A token is defined as a sequence of characters not belonging to the
|
||||
* set specified in delim.
|
||||
*
|
||||
* On the first call to av_strtok(), s should point to the string to
|
||||
* parse, and the value of saveptr is ignored. In subsequent calls, s
|
||||
* should be NULL, and saveptr should be unchanged since the previous
|
||||
* call.
|
||||
*
|
||||
* This function is similar to strtok_r() defined in POSIX.1.
|
||||
*
|
||||
* @param s the string to parse, may be NULL
|
||||
* @param delim 0-terminated list of token delimiters, must be non-NULL
|
||||
* @param saveptr user-provided pointer which points to stored
|
||||
* information necessary for av_strtok() to continue scanning the same
|
||||
* string. saveptr is updated to point to the next character after the
|
||||
* first delimiter found, or to NULL if the string was terminated
|
||||
* @return the found token, or NULL when no token is found
|
||||
*/
|
||||
char *av_strtok(char *s, const char *delim, char **saveptr);
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII isdigit.
|
||||
*/
|
||||
int av_isdigit(int c);
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII isgraph.
|
||||
*/
|
||||
int av_isgraph(int c);
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII isspace.
|
||||
*/
|
||||
int av_isspace(int c);
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII characters to uppercase.
|
||||
*/
|
||||
static inline int av_toupper(int c)
|
||||
{
|
||||
if (c >= 'a' && c <= 'z')
|
||||
c ^= 0x20;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII characters to lowercase.
|
||||
*/
|
||||
static inline int av_tolower(int c)
|
||||
{
|
||||
if (c >= 'A' && c <= 'Z')
|
||||
c ^= 0x20;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Locale-independent conversion of ASCII isxdigit.
|
||||
*/
|
||||
int av_isxdigit(int c);
|
||||
|
||||
/**
|
||||
* Locale-independent case-insensitive compare.
|
||||
* @note This means only ASCII-range characters are case-insensitive
|
||||
*/
|
||||
int av_strcasecmp(const char *a, const char *b);
|
||||
|
||||
/**
|
||||
* Locale-independent case-insensitive compare.
|
||||
* @note This means only ASCII-range characters are case-insensitive
|
||||
*/
|
||||
int av_strncasecmp(const char *a, const char *b, size_t n);
|
||||
|
||||
|
||||
/**
|
||||
* Thread safe basename.
|
||||
* @param path the path, on DOS both \ and / are considered separators.
|
||||
* @return pointer to the basename substring.
|
||||
*/
|
||||
const char *av_basename(const char *path);
|
||||
|
||||
/**
|
||||
* Thread safe dirname.
|
||||
* @param path the path, on DOS both \ and / are considered separators.
|
||||
* @return the path with the separator replaced by the string terminator or ".".
|
||||
* @note the function may change the input string.
|
||||
*/
|
||||
const char *av_dirname(char *path);
|
||||
|
||||
enum AVEscapeMode {
|
||||
AV_ESCAPE_MODE_AUTO, ///< Use auto-selected escaping mode.
|
||||
AV_ESCAPE_MODE_BACKSLASH, ///< Use backslash escaping.
|
||||
AV_ESCAPE_MODE_QUOTE, ///< Use single-quote escaping.
|
||||
};
|
||||
|
||||
/**
|
||||
* Consider spaces special and escape them even in the middle of the
|
||||
* string.
|
||||
*
|
||||
* This is equivalent to adding the whitespace characters to the special
|
||||
* characters lists, except it is guaranteed to use the exact same list
|
||||
* of whitespace characters as the rest of libavutil.
|
||||
*/
|
||||
#define AV_ESCAPE_FLAG_WHITESPACE 0x01
|
||||
|
||||
/**
|
||||
* Escape only specified special characters.
|
||||
* Without this flag, escape also any characters that may be considered
|
||||
* special by av_get_token(), such as the single quote.
|
||||
*/
|
||||
#define AV_ESCAPE_FLAG_STRICT 0x02
|
||||
|
||||
/**
|
||||
* Escape string in src, and put the escaped string in an allocated
|
||||
* string in *dst, which must be freed with av_free().
|
||||
*
|
||||
* @param dst pointer where an allocated string is put
|
||||
* @param src string to escape, must be non-NULL
|
||||
* @param special_chars string containing the special characters which
|
||||
* need to be escaped, can be NULL
|
||||
* @param mode escape mode to employ, see AV_ESCAPE_MODE_* macros.
|
||||
* Any unknown value for mode will be considered equivalent to
|
||||
* AV_ESCAPE_MODE_BACKSLASH, but this behaviour can change without
|
||||
* notice.
|
||||
* @param flags flags which control how to escape, see AV_ESCAPE_FLAG_ macros
|
||||
* @return the length of the allocated string, or a negative error code in case of error
|
||||
* @see av_bprint_escape()
|
||||
*/
|
||||
int av_escape(char **dst, const char *src, const char *special_chars,
|
||||
enum AVEscapeMode mode, int flags);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_AVSTRING_H */
|
314
3rdparty/include/ffmpeg_/libavutil/avutil.h
vendored
314
3rdparty/include/ffmpeg_/libavutil/avutil.h
vendored
@ -1,314 +0,0 @@
|
||||
/*
|
||||
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_AVUTIL_H
|
||||
#define AVUTIL_AVUTIL_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* external API header
|
||||
*/
|
||||
|
||||
/**
|
||||
* @mainpage
|
||||
*
|
||||
* @section ffmpeg_intro Introduction
|
||||
*
|
||||
* This document describes the usage of the different libraries
|
||||
* provided by FFmpeg.
|
||||
*
|
||||
* @li @ref libavc "libavcodec" encoding/decoding library
|
||||
* @li @ref lavfi "libavfilter" graph-based frame editing library
|
||||
* @li @ref libavf "libavformat" I/O and muxing/demuxing library
|
||||
* @li @ref lavd "libavdevice" special devices muxing/demuxing library
|
||||
* @li @ref lavu "libavutil" common utility library
|
||||
* @li @ref lswr "libswresample" audio resampling, format conversion and mixing
|
||||
* @li @ref lpp "libpostproc" post processing library
|
||||
* @li @ref lsws "libswscale" color conversion and scaling library
|
||||
*
|
||||
* @section ffmpeg_versioning Versioning and compatibility
|
||||
*
|
||||
* Each of the FFmpeg libraries contains a version.h header, which defines a
|
||||
* major, minor and micro version number with the
|
||||
* <em>LIBRARYNAME_VERSION_{MAJOR,MINOR,MICRO}</em> macros. The major version
|
||||
* number is incremented with backward incompatible changes - e.g. removing
|
||||
* parts of the public API, reordering public struct members, etc. The minor
|
||||
* version number is incremented for backward compatible API changes or major
|
||||
* new features - e.g. adding a new public function or a new decoder. The micro
|
||||
* version number is incremented for smaller changes that a calling program
|
||||
* might still want to check for - e.g. changing behavior in a previously
|
||||
* unspecified situation.
|
||||
*
|
||||
* FFmpeg guarantees backward API and ABI compatibility for each library as long
|
||||
* as its major version number is unchanged. This means that no public symbols
|
||||
* will be removed or renamed. Types and names of the public struct members and
|
||||
* values of public macros and enums will remain the same (unless they were
|
||||
* explicitly declared as not part of the public API). Documented behavior will
|
||||
* not change.
|
||||
*
|
||||
* In other words, any correct program that works with a given FFmpeg snapshot
|
||||
* should work just as well without any changes with any later snapshot with the
|
||||
* same major versions. This applies to both rebuilding the program against new
|
||||
* FFmpeg versions or to replacing the dynamic FFmpeg libraries that a program
|
||||
* links against.
|
||||
*
|
||||
* However, new public symbols may be added and new members may be appended to
|
||||
* public structs whose size is not part of public ABI (most public structs in
|
||||
* FFmpeg). New macros and enum values may be added. Behavior in undocumented
|
||||
* situations may change slightly (and be documented). All those are accompanied
|
||||
* by an entry in doc/APIchanges and incrementing either the minor or micro
|
||||
* version number.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup lavu Common utility functions
|
||||
*
|
||||
* @brief
|
||||
* libavutil contains the code shared across all the other FFmpeg
|
||||
* libraries
|
||||
*
|
||||
* @note In order to use the functions provided by avutil you must include
|
||||
* the specific header.
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @defgroup lavu_crypto Crypto and Hashing
|
||||
*
|
||||
* @{
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_math Maths
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_string String Manipulation
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_mem Memory Management
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_data Data Structures
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_audio Audio related
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_error Error Codes
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*
|
||||
* @defgroup lavu_misc Other
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @defgroup lavu_internal Internal
|
||||
*
|
||||
* Not exported functions, for internal usage only
|
||||
*
|
||||
* @{
|
||||
*
|
||||
* @}
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @addtogroup lavu_ver
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Return the LIBAVUTIL_VERSION_INT constant.
|
||||
*/
|
||||
unsigned avutil_version(void);
|
||||
|
||||
/**
|
||||
* Return the libavutil build-time configuration.
|
||||
*/
|
||||
const char *avutil_configuration(void);
|
||||
|
||||
/**
|
||||
* Return the libavutil license.
|
||||
*/
|
||||
const char *avutil_license(void);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup lavu_media Media Type
|
||||
* @brief Media Type
|
||||
*/
|
||||
|
||||
enum AVMediaType {
|
||||
AVMEDIA_TYPE_UNKNOWN = -1, ///< Usually treated as AVMEDIA_TYPE_DATA
|
||||
AVMEDIA_TYPE_VIDEO,
|
||||
AVMEDIA_TYPE_AUDIO,
|
||||
AVMEDIA_TYPE_DATA, ///< Opaque data information usually continuous
|
||||
AVMEDIA_TYPE_SUBTITLE,
|
||||
AVMEDIA_TYPE_ATTACHMENT, ///< Opaque data information usually sparse
|
||||
AVMEDIA_TYPE_NB
|
||||
};
|
||||
|
||||
/**
|
||||
* Return a string describing the media_type enum, NULL if media_type
|
||||
* is unknown.
|
||||
*/
|
||||
const char *av_get_media_type_string(enum AVMediaType media_type);
|
||||
|
||||
/**
|
||||
* @defgroup lavu_const Constants
|
||||
* @{
|
||||
*
|
||||
* @defgroup lavu_enc Encoding specific
|
||||
*
|
||||
* @note those definition should move to avcodec
|
||||
* @{
|
||||
*/
|
||||
|
||||
#define FF_LAMBDA_SHIFT 7
|
||||
#define FF_LAMBDA_SCALE (1<<FF_LAMBDA_SHIFT)
|
||||
#define FF_QP2LAMBDA 118 ///< factor to convert from H.263 QP to lambda
|
||||
#define FF_LAMBDA_MAX (256*128-1)
|
||||
|
||||
#define FF_QUALITY_SCALE FF_LAMBDA_SCALE //FIXME maybe remove
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @defgroup lavu_time Timestamp specific
|
||||
*
|
||||
* FFmpeg internal timebase and timestamp definitions
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Undefined timestamp value
|
||||
*
|
||||
* Usually reported by demuxer that work on containers that do not provide
|
||||
* either pts or dts.
|
||||
*/
|
||||
|
||||
#define AV_NOPTS_VALUE ((int64_t)UINT64_C(0x8000000000000000))
|
||||
|
||||
/**
|
||||
* Internal time base represented as integer
|
||||
*/
|
||||
|
||||
#define AV_TIME_BASE 1000000
|
||||
|
||||
/**
|
||||
* Internal time base represented as fractional value
|
||||
*/
|
||||
|
||||
#define AV_TIME_BASE_Q (AVRational){1, AV_TIME_BASE}
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
* @defgroup lavu_picture Image related
|
||||
*
|
||||
* AVPicture types, pixel formats and basic image planes manipulation.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
enum AVPictureType {
|
||||
AV_PICTURE_TYPE_NONE = 0, ///< Undefined
|
||||
AV_PICTURE_TYPE_I, ///< Intra
|
||||
AV_PICTURE_TYPE_P, ///< Predicted
|
||||
AV_PICTURE_TYPE_B, ///< Bi-dir predicted
|
||||
AV_PICTURE_TYPE_S, ///< S(GMC)-VOP MPEG4
|
||||
AV_PICTURE_TYPE_SI, ///< Switching Intra
|
||||
AV_PICTURE_TYPE_SP, ///< Switching Predicted
|
||||
AV_PICTURE_TYPE_BI, ///< BI type
|
||||
};
|
||||
|
||||
/**
|
||||
* Return a single letter to describe the given picture type
|
||||
* pict_type.
|
||||
*
|
||||
* @param[in] pict_type the picture type @return a single character
|
||||
* representing the picture type, '?' if pict_type is unknown
|
||||
*/
|
||||
char av_get_picture_type_char(enum AVPictureType pict_type);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
#include "error.h"
|
||||
#include "version.h"
|
||||
#include "mathematics.h"
|
||||
#include "rational.h"
|
||||
#include "intfloat_readwrite.h"
|
||||
#include "log.h"
|
||||
#include "pixfmt.h"
|
||||
|
||||
/**
|
||||
* Return x default pointer in case p is NULL.
|
||||
*/
|
||||
static inline void *av_x_if_null(const void *p, const void *x)
|
||||
{
|
||||
return (void *)(intptr_t)(p ? p : x);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the length of an integer list.
|
||||
*
|
||||
* @param elsize size in bytes of each list element (only 1, 2, 4 or 8)
|
||||
* @param term list terminator (usually 0 or -1)
|
||||
* @param list pointer to the list
|
||||
* @return length of the list, in elements, not counting the terminator
|
||||
*/
|
||||
unsigned av_int_list_length_for_size(unsigned elsize,
|
||||
const void *list, uint64_t term) av_pure;
|
||||
|
||||
/**
|
||||
* Compute the length of an integer list.
|
||||
*
|
||||
* @param term list terminator (usually 0 or -1)
|
||||
* @param list pointer to the list
|
||||
* @return length of the list, in elements, not counting the terminator
|
||||
*/
|
||||
#define av_int_list_length(list, term) \
|
||||
av_int_list_length_for_size(sizeof(*(list)), list, term)
|
||||
|
||||
/**
|
||||
* @}
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_AVUTIL_H */
|
67
3rdparty/include/ffmpeg_/libavutil/base64.h
vendored
67
3rdparty/include/ffmpeg_/libavutil/base64.h
vendored
@ -1,67 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2006 Ryan Martell. (rdm4@martellventures.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_BASE64_H
|
||||
#define AVUTIL_BASE64_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @defgroup lavu_base64 Base64
|
||||
* @ingroup lavu_crypto
|
||||
* @{
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Decode a base64-encoded string.
|
||||
*
|
||||
* @param out buffer for decoded data
|
||||
* @param in null-terminated input string
|
||||
* @param out_size size in bytes of the out buffer, must be at
|
||||
* least 3/4 of the length of in
|
||||
* @return number of bytes written, or a negative value in case of
|
||||
* invalid input
|
||||
*/
|
||||
int av_base64_decode(uint8_t *out, const char *in, int out_size);
|
||||
|
||||
/**
|
||||
* Encode data to base64 and null-terminate.
|
||||
*
|
||||
* @param out buffer for encoded data
|
||||
* @param out_size size in bytes of the out buffer (including the
|
||||
* null terminator), must be at least AV_BASE64_SIZE(in_size)
|
||||
* @param in input buffer containing the data to encode
|
||||
* @param in_size size in bytes of the in buffer
|
||||
* @return out or NULL in case of error
|
||||
*/
|
||||
char *av_base64_encode(char *out, int out_size, const uint8_t *in, int in_size);
|
||||
|
||||
/**
|
||||
* Calculate the output size needed to base64-encode x bytes to a
|
||||
* null-terminated string.
|
||||
*/
|
||||
#define AV_BASE64_SIZE(x) (((x)+2) / 3 * 4 + 1)
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_BASE64_H */
|
77
3rdparty/include/ffmpeg_/libavutil/blowfish.h
vendored
77
3rdparty/include/ffmpeg_/libavutil/blowfish.h
vendored
@ -1,77 +0,0 @@
|
||||
/*
|
||||
* Blowfish algorithm
|
||||
* Copyright (c) 2012 Samuel Pitoiset
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVUTIL_BLOWFISH_H
|
||||
#define AVUTIL_BLOWFISH_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* @defgroup lavu_blowfish Blowfish
|
||||
* @ingroup lavu_crypto
|
||||
* @{
|
||||
*/
|
||||
|
||||
#define AV_BF_ROUNDS 16
|
||||
|
||||
typedef struct AVBlowfish {
|
||||
uint32_t p[AV_BF_ROUNDS + 2];
|
||||
uint32_t s[4][256];
|
||||
} AVBlowfish;
|
||||
|
||||
/**
|
||||
* Initialize an AVBlowfish context.
|
||||
*
|
||||
* @param ctx an AVBlowfish context
|
||||
* @param key a key
|
||||
* @param key_len length of the key
|
||||
*/
|
||||
void av_blowfish_init(struct AVBlowfish *ctx, const uint8_t *key, int key_len);
|
||||
|
||||
/**
|
||||
* Encrypt or decrypt a buffer using a previously initialized context.
|
||||
*
|
||||
* @param ctx an AVBlowfish context
|
||||
* @param xl left four bytes halves of input to be encrypted
|
||||
* @param xr right four bytes halves of input to be encrypted
|
||||
* @param decrypt 0 for encryption, 1 for decryption
|
||||
*/
|
||||
void av_blowfish_crypt_ecb(struct AVBlowfish *ctx, uint32_t *xl, uint32_t *xr,
|
||||
int decrypt);
|
||||
|
||||
/**
|
||||
* Encrypt or decrypt a buffer using a previously initialized context.
|
||||
*
|
||||
* @param ctx an AVBlowfish context
|
||||
* @param dst destination array, can be equal to src
|
||||
* @param src source array, can be equal to dst
|
||||
* @param count number of 8 byte blocks
|
||||
* @param iv initialization vector for CBC mode, if NULL ECB will be used
|
||||
* @param decrypt 0 for encryption, 1 for decryption
|
||||
*/
|
||||
void av_blowfish_crypt(struct AVBlowfish *ctx, uint8_t *dst, const uint8_t *src,
|
||||
int count, uint8_t *iv, int decrypt);
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif /* AVUTIL_BLOWFISH_H */
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user