Merge pull request #24565 from CNClareChen:4.x

Change LSX to a baseline feature: on LoongArch64, LSX becomes the compile-time
baseline and LASX becomes a dispatched extension that is enabled at run time.

Commit: 3893936243
Mirror of https://github.com/opencv/opencv.git
cmake/OpenCVCompilerOptimizations.cmake

@@ -403,11 +403,8 @@ elseif(LOONGARCH64)
   ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX")
   ocv_update(CPU_LSX_FLAGS_ON "-mlsx")
   ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
-  if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX")
-    set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
-  else()
-    set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
-  endif()
+  set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
+  set(CPU_DISPATCH "LASX" CACHE STRING "${HELP_CPU_DISPATCH}")
 endif()
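With this hunk, LSX moves into CPU_BASELINE (compiled in unconditionally) and LASX into CPU_DISPATCH (compiled separately and selected at run time). A minimal sketch of how to observe the result from application code, assuming a LoongArch64 build configured this way; cv::checkHardwareSupport and the CV_CPU_LSX/CV_CPU_LASX feature IDs are existing OpenCV API:

// Minimal sketch: report which LoongArch SIMD tiers this OpenCV build/CPU
// combination will use. Baseline features are always reported as available
// by a binary built with them; dispatched features depend on the CPU.
#include <opencv2/core/utility.hpp>
#include <cstdio>

int main()
{
    std::printf("LSX  (baseline):   %d\n", cv::checkHardwareSupport(CV_CPU_LSX));
    std::printf("LASX (dispatched): %d\n", cv::checkHardwareSupport(CV_CPU_LASX));
    return 0;
}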
modules/core/CMakeLists.txt

@@ -1,21 +1,21 @@
 set(the_description "The Core Functionality")

-ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
-ocv_add_dispatched_file(stat SSE4_2 AVX2)
-ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
-ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
-ocv_add_dispatched_file(convert_scale SSE2 AVX2)
-ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(has_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
-ocv_add_dispatched_file(mean SSE2 AVX2)
-ocv_add_dispatched_file(merge SSE2 AVX2)
-ocv_add_dispatched_file(split SSE2 AVX2)
-ocv_add_dispatched_file(sum SSE2 AVX2)
+ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2 LASX)
+ocv_add_dispatched_file(stat SSE4_2 AVX2 LASX)
+ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3 LASX)
+ocv_add_dispatched_file(convert SSE2 AVX2 VSX3 LASX)
+ocv_add_dispatched_file(convert_scale SSE2 AVX2 LASX)
+ocv_add_dispatched_file(count_non_zero SSE2 AVX2 LASX)
+ocv_add_dispatched_file(has_non_zero SSE2 AVX2 LASX)
+ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD LASX)
+ocv_add_dispatched_file(mean SSE2 AVX2 LASX)
+ocv_add_dispatched_file(merge SSE2 AVX2 LASX)
+ocv_add_dispatched_file(split SSE2 AVX2 LASX)
+ocv_add_dispatched_file(sum SSE2 AVX2 LASX)

 # dispatching for accuracy tests
 ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
-ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX LASX)
 ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)
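Appending LASX to these ocv_add_dispatched_file() calls makes the build compile each listed *.simd.hpp kernel once more with -mlasx and route to it through generated dispatcher glue. A condensed, self-contained sketch of the dispatch idea; sum_lsx/sum_lasx are hypothetical stand-ins for the per-ISA builds, not symbols from this patch:

// Illustrative scalar stand-ins for the per-ISA kernel builds the CMake
// helper generates (hypothetical names, bodies simplified to scalar code).
#include <opencv2/core/utility.hpp>

static int sum_lsx(const int* p, int n)  { int s = 0; for (int i = 0; i < n; i++) s += p[i]; return s; }
static int sum_lasx(const int* p, int n) { int s = 0; for (int i = 0; i < n; i++) s += p[i]; return s; }

// The generated dispatcher boils down to a runtime feature check like this.
int sum(const int* p, int n)
{
    if (cv::checkHardwareSupport(CV_CPU_LASX))
        return sum_lasx(p, n);  // 256-bit LASX build, dispatched
    return sum_lsx(p, n);       // 128-bit LSX build, baseline
}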
modules/core/include/opencv2/core/hal/intrin.hpp

@@ -246,12 +246,6 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

 #include "opencv2/core/hal/intrin_lsx.hpp"

-#elif CV_LASX
-#if !defined(CV_FORCE_SIMD128_CPP)
-#define CV_FORCE_SIMD128_CPP 1
-#endif
-#include "opencv2/core/hal/intrin_cpp.hpp"
-
 #else

 #include "opencv2/core/hal/intrin_cpp.hpp"
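Before this hunk, a CV_LASX build forced the 128-bit universal intrinsics onto the portable C++ implementation via CV_FORCE_SIMD128_CPP; with LSX always in the baseline, intrin_lsx.hpp supplies them natively. A simplified sketch of the selection after the change (an assumption-laden condensation: the real chain in intrin.hpp also covers SSE/NEON/VSX/RVV, and the LASX header is pulled in by a separate 256-bit section):

// Simplified view of the 128-bit header selection after this patch
// (condensed sketch, not the verbatim intrin.hpp chain).
#if CV_LSX
#  include "opencv2/core/hal/intrin_lsx.hpp"  // native 128-bit lanes, now baseline
#else
#  include "opencv2/core/hal/intrin_cpp.hpp"  // portable fallback
#endif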
modules/core/include/opencv2/core/hal/intrin_lasx.hpp

@@ -1419,20 +1419,6 @@ inline v_uint32x8 v_popcount(const v_int32x8& a)
 inline v_uint64x4 v_popcount(const v_int64x4& a)
 { return v_popcount(v_reinterpret_as_u64(a)); }

 /** Mask **/
-#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
-inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
-OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
-OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
-OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
-OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
-OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
-OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
-OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
-
 inline int v_signmask(const v_int8x32& a)
 {
     __m256i result = __lasx_xvmskltz_b(a.val);
@@ -2151,7 +2137,8 @@ template<int n> inline
 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
 {
     __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 template<int n> inline
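This and the following hunks apply the same optimization: the old sequence needed a cross-lane shuffle, a 128-bit extract, and an LSX store, while __lasx_xvstelm_d stores one selected 64-bit element directly to an unaligned address. Because the xvssrlrni/xvssrarni narrowing intrinsics pack each 128-bit lane independently, the 16 useful bytes land in 64-bit elements 0 and 2, and two element stores write them contiguously. A minimal sketch of the new store pattern (LoongArch-only; assumes GCC's lasxintrin.h and -mlasx):

// Store the packed result held in 64-bit elements 0 and 2 of a 256-bit
// register to 16 contiguous bytes, with no cross-lane shuffle.
#include <lasxintrin.h>

void store_packed(unsigned char* ptr, __m256i res)
{
    __lasx_xvstelm_d(res, ptr, 0, 0); // lane 0 result -> bytes 0..7
    __lasx_xvstelm_d(res, ptr, 8, 2); // lane 1 result -> bytes 8..15
}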
@@ -2165,7 +2152,8 @@ template<int n> inline
 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
 {
     __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 template<int n> inline
@@ -2179,7 +2167,8 @@ template<int n> inline
 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
 {
     __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 // 32
@@ -2198,7 +2187,8 @@ inline void v_pack_store(short* ptr, const v_int32x8& a)
 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
 {
     __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
@@ -2212,7 +2202,8 @@ template<int n> inline
 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
 {
     __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 template<int n> inline
@@ -2223,7 +2214,8 @@ template<int n> inline
 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
 {
     __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 template<int n> inline
@@ -2234,7 +2226,8 @@ template<int n> inline
 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
 {
     __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
-    __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }

 // 64
@@ -2263,7 +2256,11 @@ v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)

 template<int n> inline
 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
-{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); }
+{
+    __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
+}

 template<int n> inline
 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
@@ -2271,7 +2268,11 @@ v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)

 template<int n> inline
 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
-{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); }
+{
+    __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
+}

 // pack boolean
 inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
modules/core/src/system.cpp

@@ -154,6 +154,12 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 # endif
 #endif

+#if defined __loongarch64
+#include "sys/auxv.h"
+#define LA_HWCAP_LSX   (1<<4)
+#define LA_HWCAP_LASX  (1<<5)
+#endif
+
 #if defined _WIN32 || defined WINCE
 #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
 #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@@ -704,12 +710,11 @@ struct HWFeatures
     have[CV_CPU_RVV] = true;
     #endif

-    #if defined __loongarch_sx
-    have[CV_CPU_LSX] = true;
-    #endif
-
-    #if defined __loongarch_asx
-    have[CV_CPU_LASX] = true;
+    #if defined __loongarch64 && defined __linux__
+    int flag = (int)getauxval(AT_HWCAP);
+
+    have[CV_CPU_LSX] = (flag & LA_HWCAP_LSX) != 0;
+    have[CV_CPU_LASX] = (flag & LA_HWCAP_LASX) != 0;
     #endif

     bool skip_baseline_check = false;
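The old compile-time probes (__loongarch_sx/__loongarch_asx) only say what the compiler was allowed to emit, not what the machine running the binary supports; with LSX baseline and LASX dispatched, detection moves to the kernel's AT_HWCAP auxiliary vector. A standalone sketch of the same probe (Linux on LoongArch64; the HWCAP bit positions mirror the defines added above):

// Query LoongArch SIMD support from the kernel's auxiliary vector.
#include <sys/auxv.h>
#include <cstdio>

#define LA_HWCAP_LSX  (1 << 4)
#define LA_HWCAP_LASX (1 << 5)

int main()
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    std::printf("LSX:  %s\n", (hwcap & LA_HWCAP_LSX)  ? "yes" : "no");
    std::printf("LASX: %s\n", (hwcap & LA_HWCAP_LASX) ? "yes" : "no");
    return 0;
}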