diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index d45f327beb..32f176e930 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -403,11 +403,8 @@ elseif(LOONGARCH64) ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX") ocv_update(CPU_LSX_FLAGS_ON "-mlsx") ocv_update(CPU_LASX_FLAGS_ON "-mlasx") - if("${CPU_BASELINE_DISABLE}" STREQUAL "LASX") - set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}") - else() - set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}") - endif() + set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}") + set(CPU_DISPATCH "LASX" CACHE STRING "${HELP_CPU_DISPATCH}") endif() diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index ba5b61ef5f..4d5ebf3483 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,21 +1,21 @@ set(the_description "The Core Functionality") -ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) -ocv_add_dispatched_file(stat SSE4_2 AVX2) -ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) -ocv_add_dispatched_file(convert SSE2 AVX2 VSX3) -ocv_add_dispatched_file(convert_scale SSE2 AVX2) -ocv_add_dispatched_file(count_non_zero SSE2 AVX2) -ocv_add_dispatched_file(has_non_zero SSE2 AVX2) -ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD) -ocv_add_dispatched_file(mean SSE2 AVX2) -ocv_add_dispatched_file(merge SSE2 AVX2) -ocv_add_dispatched_file(split SSE2 AVX2) -ocv_add_dispatched_file(sum SSE2 AVX2) +ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2 LASX) +ocv_add_dispatched_file(stat SSE4_2 AVX2 LASX) +ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3 LASX) +ocv_add_dispatched_file(convert SSE2 AVX2 VSX3 LASX) +ocv_add_dispatched_file(convert_scale SSE2 AVX2 LASX) +ocv_add_dispatched_file(count_non_zero SSE2 AVX2 LASX) +ocv_add_dispatched_file(has_non_zero SSE2 AVX2 LASX ) +ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD LASX) +ocv_add_dispatched_file(mean SSE2 AVX2 LASX) +ocv_add_dispatched_file(merge SSE2 AVX2 LASX) +ocv_add_dispatched_file(split SSE2 AVX2 LASX) +ocv_add_dispatched_file(sum SSE2 AVX2 LASX) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX) -ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX) +ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX LASX) ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 3968cba8f0..7897fb503f 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -246,12 +246,6 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #include "opencv2/core/hal/intrin_lsx.hpp" -#elif CV_LASX - #if !defined(CV_FORCE_SIMD128_CPP) - #define CV_FORCE_SIMD128_CPP 1 - #endif -#include "opencv2/core/hal/intrin_cpp.hpp" - #else #include "opencv2/core/hal/intrin_cpp.hpp" diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp index 5214e80743..6546d6db7d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -1419,20 +1419,6 @@ inline v_uint32x8 v_popcount(const v_int32x8& a) inline v_uint64x4 v_popcount(const v_int64x4& a) { return v_popcount(v_reinterpret_as_u64(a)); } -/** Mask **/ -#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ -inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } -OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) -OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) -OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) -OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) -OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) -OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) -OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) -OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) -OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) -OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) - inline int v_signmask(const v_int8x32& a) { __m256i result = __lasx_xvmskltz_b(a.val); @@ -2151,7 +2137,8 @@ template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } template inline @@ -2165,7 +2152,8 @@ template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } template inline @@ -2179,7 +2167,8 @@ template inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } // 32 @@ -2198,7 +2187,8 @@ inline void v_pack_store(short* ptr, const v_int32x8& a) inline void v_pack_store(ushort* ptr, const v_uint32x8& a) { __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) @@ -2212,7 +2202,8 @@ template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } template inline @@ -2223,7 +2214,8 @@ template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } template inline @@ -2234,7 +2226,8 @@ template inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n); - __lsx_vst(_v256_extract_low(_v256_shuffle_odd_64(res)), ptr, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); } // 64 @@ -2263,7 +2256,11 @@ v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) template inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) -{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(a.val, a.val, n)), ptr, 0); } +{ + __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} template inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) @@ -2271,7 +2268,11 @@ v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) template inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) -{ __lsx_vst(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(a.val, a.val, n)), ptr, 0); } +{ + __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} // pack boolean inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 8d651b8c8d..9f67d92a43 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -154,6 +154,12 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); } # endif #endif +#if defined __loongarch64 +#include "sys/auxv.h" +#define LA_HWCAP_LSX (1<<4) +#define LA_HWCAP_LASX (1<<5) +#endif + #if defined _WIN32 || defined WINCE #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?) #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx @@ -704,12 +710,11 @@ struct HWFeatures have[CV_CPU_RVV] = true; #endif - #if defined __loongarch_sx - have[CV_CPU_LSX] = true; - #endif + #if defined __loongarch64 && defined __linux__ + int flag = (int)getauxval(AT_HWCAP); - #if defined __loongarch_asx - have[CV_CPU_LASX] = true; + have[CV_CPU_LSX] = (flag & LA_HWCAP_LSX) != 0; + have[CV_CPU_LASX] = (flag & LA_HWCAP_LASX) != 0; #endif bool skip_baseline_check = false;