opencv/3rdparty/ndsrvp/include/core.hpp
Junyan721113 d9421ac148
Merge pull request #25167 from plctlab:rvp_3rdparty
3rdparty: NDSRVP - A New 3rdparty Library with Optimizations Based on RISC-V P Extension v0.5.2 - Part 1: Basic Functions #25167

# Summary

### Previous context
From PR #24556: 

>> * As you wrote, the P-extension differs from RVV thus can not be easily implemented via Universal Intrinsics mechanism, but there is another HAL mechanism for lower-level CPU optimizations which is used by the [Carotene](https://github.com/opencv/opencv/tree/4.x/3rdparty/carotene) library on ARM platforms. I suggest moving all non-dnn code to similar third-party component. For example, FAST algorithm should allow such optimization-shortcut: see https://github.com/opencv/opencv/blob/4.x/modules/features2d/src/hal_replacement.hpp
>>   Reference documentation is here:
>>   
>>   * https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html
>>   * https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html
>>   * https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html
>>   * Carotene library is turned on here: 8bbf08f0de/CMakeLists.txt (L906-L911)

> As a test outside of this PR, a 3rdparty component called ndsrvp was created, containing one piece of the non-dnn code (integral_SIMD), and it works very well.
> All the non-dnn code in this PR has been removed, so this PR can currently focus on dnn optimizations.
> This HAL mechanism is quite suitable for rvp optimizations, all the non-dnn code is expected to be moved into ndsrvp soon.

### Progress

#### Part 1 (This PR)

- [Core](https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html)
- [x] Element-wise add and subtract
- [x] Element-wise minimum or maximum
- [x] Element-wise absolute difference
- [x] Bitwise logical operations
- [x] Element-wise compare
- [ImgProc](https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html)
- [x] Integral
- [x] Threshold
- [x] WarpAffine
- [x] WarpPerspective
- [Features2D](https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html)

#### Part 2 (Next PR)

**Rough Estimate. Todo List May Change.**

- [Core](https://docs.opencv.org/4.x/d1/d1b/group__core__hal__interface.html)
- [ImgProc](https://docs.opencv.org/4.x/dd/d8b/group__imgproc__hal__interface.html)
- smaller remap HAL interface
- AdaptiveThreshold
- BoxFilter
- Canny
- Convert
- Filter
- GaussianBlur
- MedianBlur
- Morph
- Pyrdown
- Resize
- Scharr
- SepFilter
- Sobel
- [Features2D](https://docs.opencv.org/4.x/db/d47/group__features2d__hal__interface.html)
- FAST

### Performance Tests

The optimization does not contain floating point operations.

**Absolute Difference**

Geometric mean (ms)

|Name of Test|opencv perf core Absdiff|opencv perf core Absdiff|opencv perf core Absdiff vs opencv perf core Absdiff (x-factor)|
|---|:-:|:-:|:-:|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC1)|23.104|5.972|3.87|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC1)|39.500|40.830|0.97|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC3)|69.155|15.051|4.59|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC3)|118.715|120.509|0.99|
|Absdiff::OCL_AbsDiffFixture::(640x480, 8UC4)|93.001|19.770|4.70|
|Absdiff::OCL_AbsDiffFixture::(640x480, 32FC4)|161.136|160.791|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC1)|69.211|15.140|4.57|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC1)|118.762|119.263|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC3)|212.414|44.692|4.75|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC3)|367.512|366.569|1.00|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 8UC4)|285.337|59.708|4.78|
|Absdiff::OCL_AbsDiffFixture::(1280x720, 32FC4)|490.395|491.118|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC1)|158.827|33.462|4.75|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC1)|273.503|273.668|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC3)|484.175|100.520|4.82|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC3)|828.758|829.689|1.00|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 8UC4)|648.592|137.195|4.73|
|Absdiff::OCL_AbsDiffFixture::(1920x1080, 32FC4)|1116.755|1109.587|1.01|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC1)|648.715|134.875|4.81|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC1)|1115.939|1113.818|1.00|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC3)|1944.791|413.420|4.70|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC3)|3354.193|3324.672|1.01|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 8UC4)|2594.585|553.486|4.69|
|Absdiff::OCL_AbsDiffFixture::(3840x2160, 32FC4)|4473.543|4438.453|1.01|

**Bitwise Operation**

Geometric mean (ms)

|Name of Test|opencv perf core Bit|opencv perf core Bit|opencv perf core Bit vs opencv perf core Bit (x-factor)|
|---|:-:|:-:|:-:|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC1)|22.542|4.971|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC1)|90.210|19.917|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC3)|68.429|15.037|4.55|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC3)|280.168|59.239|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 8UC4)|90.565|19.735|4.59|
|Bitwise_and::OCL_BitwiseAndFixture::(640x480, 32FC4)|374.695|79.257|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC1)|67.824|14.873|4.56|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC1)|279.514|59.232|4.72|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC3)|208.337|44.234|4.71|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC3)|851.211|182.522|4.66|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 8UC4)|279.529|59.095|4.73|
|Bitwise_and::OCL_BitwiseAndFixture::(1280x720, 32FC4)|1132.065|244.877|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC1)|155.685|33.078|4.71|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC1)|635.253|137.482|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC3)|474.494|100.166|4.74|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC3)|1907.340|412.841|4.62|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 8UC4)|635.538|134.544|4.72|
|Bitwise_and::OCL_BitwiseAndFixture::(1920x1080, 32FC4)|2552.666|556.397|4.59|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC1)|634.736|136.355|4.66|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC1)|2548.283|561.827|4.54|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC3)|1911.454|421.571|4.53|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC3)|7663.803|1677.289|4.57|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 8UC4)|2543.983|562.780|4.52|
|Bitwise_and::OCL_BitwiseAndFixture::(3840x2160, 32FC4)|10211.693|2237.393|4.56|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC1)|22.341|4.811|4.64|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC1)|89.975|19.288|4.66|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC3)|67.237|14.643|4.59|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC3)|276.324|58.609|4.71|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 8UC4)|89.587|19.554|4.58|
|Bitwise_not::OCL_BitwiseNotFixture::(640x480, 32FC4)|370.986|77.136|4.81|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC1)|67.227|14.541|4.62|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC1)|276.357|58.076|4.76|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC3)|206.752|43.376|4.77|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC3)|841.638|177.787|4.73|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 8UC4)|276.773|57.784|4.79|
|Bitwise_not::OCL_BitwiseNotFixture::(1280x720, 32FC4)|1127.740|237.472|4.75|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC1)|153.808|32.531|4.73|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC1)|627.765|129.990|4.83|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC3)|469.799|98.249|4.78|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC3)|1893.591|403.694|4.69|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 8UC4)|627.724|129.962|4.83|
|Bitwise_not::OCL_BitwiseNotFixture::(1920x1080, 32FC4)|2529.967|540.744|4.68|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC1)|628.089|130.277|4.82|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC1)|2521.817|540.146|4.67|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC3)|1905.004|404.704|4.71|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC3)|7567.971|1627.898|4.65|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 8UC4)|2531.476|540.181|4.69|
|Bitwise_not::OCL_BitwiseNotFixture::(3840x2160, 32FC4)|10075.594|2181.654|4.62|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC1)|22.566|5.076|4.45|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC1)|90.391|19.928|4.54|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC3)|67.758|14.740|4.60|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC3)|279.253|59.844|4.67|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 8UC4)|90.296|19.802|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(640x480, 32FC4)|373.972|79.815|4.69|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC1)|67.815|14.865|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC1)|279.398|60.054|4.65|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC3)|208.643|45.043|4.63|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC3)|850.042|180.985|4.70|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 8UC4)|279.363|60.385|4.63|
|Bitwise_or::OCL_BitwiseOrFixture::(1280x720, 32FC4)|1134.858|243.062|4.67|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC1)|155.212|33.155|4.68|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC1)|634.985|134.911|4.71|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC3)|474.648|100.407|4.73|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC3)|1912.049|414.184|4.62|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 8UC4)|635.252|132.587|4.79|
|Bitwise_or::OCL_BitwiseOrFixture::(1920x1080, 32FC4)|2544.471|560.737|4.54|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC1)|634.574|134.966|4.70|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC1)|2545.129|561.498|4.53|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC3)|1910.900|419.365|4.56|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC3)|7662.603|1685.812|4.55|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 8UC4)|2548.971|560.787|4.55|
|Bitwise_or::OCL_BitwiseOrFixture::(3840x2160, 32FC4)|10201.407|2237.552|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC1)|22.718|4.961|4.58|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC1)|91.496|19.831|4.61|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC3)|67.910|15.151|4.48|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC3)|279.612|59.792|4.68|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 8UC4)|91.073|19.853|4.59|
|Bitwise_xor::OCL_BitwiseXorFixture::(640x480, 32FC4)|374.641|79.155|4.73|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC1)|67.704|15.008|4.51|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC1)|279.229|60.088|4.65|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC3)|208.156|44.426|4.69|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC3)|849.501|180.848|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 8UC4)|279.642|59.728|4.68|
|Bitwise_xor::OCL_BitwiseXorFixture::(1280x720, 32FC4)|1129.826|242.880|4.65|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC1)|155.585|33.354|4.66|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC1)|634.090|134.995|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC3)|474.931|99.598|4.77|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC3)|1910.519|413.138|4.62|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 8UC4)|635.026|135.155|4.70|
|Bitwise_xor::OCL_BitwiseXorFixture::(1920x1080, 32FC4)|2560.167|560.838|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC1)|634.893|134.883|4.71|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC1)|2548.166|560.831|4.54|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC3)|1911.392|419.816|4.55|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC3)|7646.634|1677.988|4.56|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 8UC4)|2560.637|560.805|4.57|
|Bitwise_xor::OCL_BitwiseXorFixture::(3840x2160, 32FC4)|10227.044|2249.458|4.55|

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2024-05-28 14:25:53 +03:00

533 lines
19 KiB
C++

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_CORE_HPP
#define OPENCV_NDSRVP_CORE_HPP
namespace cv {
namespace ndsrvp {
/**
 * @brief Row-by-row driver for element-wise binary HAL operations.
 *
 * Every row is consumed in chunks of `nlane` elements through
 * `operators_t<srctype, dsttype>::vector`; the remaining elements of the row
 * (fewer than `nlane`) go through `operators_t<srctype, dsttype>::scalar`.
 *
 * @tparam srctype     element type of both inputs
 * @tparam dsttype     element type of the output
 * @tparam vsrctype    packed type used to load `nlane` input elements at once
 * @tparam vdsttype    packed type used to store `nlane` output elements at once
 * @tparam nlane       number of elements handled per vector step
 * @tparam operators_t functor template providing `vector` and `scalar`
 * @tparam params_t    types of the extra arguments forwarded to the functor
 *
 * @param src1_data first element of the first input
 * @param src1_step distance between rows of the first input, in bytes
 * @param src2_data first element of the second input
 * @param src2_step distance between rows of the second input, in bytes
 * @param dst_data  first element of the output
 * @param dst_step  distance between rows of the output, in bytes
 * @param width     number of elements per row
 * @param height    number of rows
 * @param params    forwarded unchanged to every `vector` / `scalar` call
 * @return CV_HAL_ERROR_OK in all cases
 */
template <typename srctype, typename dsttype,
    typename vsrctype, typename vdsttype, int nlane,
    template <typename src, typename dst> typename operators_t,
    typename... params_t>
int elemwise_binop(const srctype* src1_data, size_t src1_step,
    const srctype* src2_data, size_t src2_step,
    dsttype* dst_data, size_t dst_step,
    int width, int height, params_t... params)
{
    // Steps arrive in bytes; turn them into element counts for pointer arithmetic.
    src1_step /= sizeof(srctype);
    src2_step /= sizeof(srctype);
    dst_step /= sizeof(dsttype);

    operators_t<srctype, dsttype> operators;

    for (int i = 0; i < height; ++i) {
        const srctype* src1_row = src1_data + (src1_step * i);
        const srctype* src2_row = src2_data + (src2_step * i);
        dsttype* dst_row = dst_data + (dst_step * i);

        int j = 0;

        // Vector part: `nlane` elements per iteration.
        // The locals are plain (no `register`, which C++17 no longer allows)
        // and the loads keep the const qualification of the input rows.
        // NOTE(review): the packed loads/stores assume the target tolerates
        // the alignment of `row + j` - confirm for the supported cores.
        for (; j + nlane <= width; j += nlane) {
            const vsrctype vs1 = *reinterpret_cast<const vsrctype*>(src1_row + j);
            const vsrctype vs2 = *reinterpret_cast<const vsrctype*>(src2_row + j);
            *reinterpret_cast<vdsttype*>(dst_row + j) = operators.vector(vs1, vs2, params...);
        }

        // Scalar tail: whatever did not fill a whole vector.
        for (; j < width; j++)
            dst_row[j] = operators.scalar(src1_row[j], src2_row[j], params...);
    }
    return CV_HAL_ERROR_OK;
}
/**
 * @brief Row-by-row driver for element-wise unary HAL operations.
 *
 * Every row is consumed in chunks of `nlane` elements through
 * `operators_t<srctype, dsttype>::vector`; the remaining elements of the row
 * (fewer than `nlane`) go through `operators_t<srctype, dsttype>::scalar`.
 *
 * @tparam srctype     element type of the input
 * @tparam dsttype     element type of the output
 * @tparam vsrctype    packed type used to load `nlane` input elements at once
 * @tparam vdsttype    packed type used to store `nlane` output elements at once
 * @tparam nlane       number of elements handled per vector step
 * @tparam operators_t functor template providing `vector` and `scalar`
 * @tparam params_t    types of the extra arguments forwarded to the functor
 *
 * @param src_data first element of the input
 * @param src_step distance between rows of the input, in bytes
 * @param dst_data first element of the output
 * @param dst_step distance between rows of the output, in bytes
 * @param width    number of elements per row
 * @param height   number of rows
 * @param params   forwarded unchanged to every `vector` / `scalar` call
 * @return CV_HAL_ERROR_OK in all cases
 */
template <typename srctype, typename dsttype,
    typename vsrctype, typename vdsttype, int nlane,
    template <typename src, typename dst> typename operators_t,
    typename... params_t>
int elemwise_unop(const srctype* src_data, size_t src_step,
    dsttype* dst_data, size_t dst_step,
    int width, int height, params_t... params)
{
    // Steps arrive in bytes; turn them into element counts for pointer arithmetic.
    src_step /= sizeof(srctype);
    dst_step /= sizeof(dsttype);

    operators_t<srctype, dsttype> operators;

    for (int i = 0; i < height; ++i) {
        const srctype* src_row = src_data + (src_step * i);
        dsttype* dst_row = dst_data + (dst_step * i);

        int j = 0;

        // Vector part: `nlane` elements per iteration.
        // The local is plain (no `register`, which C++17 no longer allows)
        // and the load keeps the const qualification of the input row.
        // NOTE(review): the packed load/store assumes the target tolerates
        // the alignment of `row + j` - confirm for the supported cores.
        for (; j + nlane <= width; j += nlane) {
            const vsrctype vs = *reinterpret_cast<const vsrctype*>(src_row + j);
            *reinterpret_cast<vdsttype*>(dst_row + j) = operators.vector(vs, params...);
        }

        // Scalar tail: whatever did not fill a whole vector.
        for (; j < width; j++)
            dst_row[j] = operators.scalar(src_row[j], params...);
    }
    return CV_HAL_ERROR_OK;
}
// ################ add ################
/**
 * Addition functor for elemwise_binop.
 *
 * Provides one `vector` / `scalar` overload pair per supported element type;
 * each overload forwards to the matching `__nds__` add intrinsic.
 * NOTE(review): the `ukadd*` / `kadd*` forms are presumably the saturating
 * variants of the P extension - confirm against the intrinsic reference.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_add_t {
    // 8-bit unsigned
    uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_ukadd8(a, b); }
    uchar scalar(uchar a, uchar b) { return __nds__ukadd8(a, b); }

    // 8-bit signed
    int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_kadd8(a, b); }
    schar scalar(schar a, schar b) { return __nds__kadd8(a, b); }

    // 16-bit unsigned
    uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_ukadd16(a, b); }
    ushort scalar(ushort a, ushort b) { return __nds__ukadd16(a, b); }

    // 16-bit signed
    int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_kadd16(a, b); }
    short scalar(short a, short b) { return __nds__kadd16(a, b); }

    // 32-bit signed
    int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_kadd32(a, b); }
    int scalar(int a, int b) { return __nds__kadd32(a, b); }
};

// Point the cv_hal_add* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined.
#undef cv_hal_add8u
#define cv_hal_add8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_add_t>)
#undef cv_hal_add8s
#define cv_hal_add8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_add_t>)
#undef cv_hal_add16u
#define cv_hal_add16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_add_t>)
#undef cv_hal_add16s
#define cv_hal_add16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_add_t>)
#undef cv_hal_add32s
#define cv_hal_add32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_add_t>)
// ################ sub ################
/**
 * Subtraction functor for elemwise_binop.
 *
 * Provides one `vector` / `scalar` overload pair per supported element type;
 * each overload forwards to the matching `__nds__` subtract intrinsic.
 * NOTE(review): the `uksub*` / `ksub*` forms are presumably the saturating
 * variants of the P extension - confirm against the intrinsic reference.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_sub_t {
    // 8-bit unsigned
    uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_uksub8(a, b); }
    uchar scalar(uchar a, uchar b) { return __nds__uksub8(a, b); }

    // 8-bit signed
    int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_ksub8(a, b); }
    schar scalar(schar a, schar b) { return __nds__ksub8(a, b); }

    // 16-bit unsigned
    uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_uksub16(a, b); }
    ushort scalar(ushort a, ushort b) { return __nds__uksub16(a, b); }

    // 16-bit signed
    int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_ksub16(a, b); }
    short scalar(short a, short b) { return __nds__ksub16(a, b); }

    // 32-bit signed
    int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_ksub32(a, b); }
    int scalar(int a, int b) { return __nds__ksub32(a, b); }
};

// Point the cv_hal_sub* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined.
#undef cv_hal_sub8u
#define cv_hal_sub8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_sub_t>)
#undef cv_hal_sub8s
#define cv_hal_sub8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_sub_t>)
#undef cv_hal_sub16u
#define cv_hal_sub16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_sub_t>)
#undef cv_hal_sub16s
#define cv_hal_sub16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_sub_t>)
#undef cv_hal_sub32s
#define cv_hal_sub32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_sub_t>)
// ################ max ################
/**
 * Maximum functor for elemwise_binop.
 *
 * Provides one `vector` / `scalar` overload pair per supported element type;
 * unsigned types forward to the `umax*` intrinsics, signed types to `smax*`.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_max_t {
    // 8-bit unsigned
    uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_umax8(a, b); }
    uchar scalar(uchar a, uchar b) { return __nds__umax8(a, b); }

    // 8-bit signed
    int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_smax8(a, b); }
    schar scalar(schar a, schar b) { return __nds__smax8(a, b); }

    // 16-bit unsigned
    uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_umax16(a, b); }
    ushort scalar(ushort a, ushort b) { return __nds__umax16(a, b); }

    // 16-bit signed
    int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_smax16(a, b); }
    short scalar(short a, short b) { return __nds__smax16(a, b); }

    // 32-bit signed
    int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_smax32(a, b); }
    int scalar(int a, int b) { return __nds__smax32(a, b); }
};

// Point the cv_hal_max* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined.
#undef cv_hal_max8u
#define cv_hal_max8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_max_t>)
#undef cv_hal_max8s
#define cv_hal_max8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_max_t>)
#undef cv_hal_max16u
#define cv_hal_max16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_max_t>)
#undef cv_hal_max16s
#define cv_hal_max16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_max_t>)
#undef cv_hal_max32s
#define cv_hal_max32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_max_t>)
// ################ min ################
/**
 * Minimum functor for elemwise_binop.
 *
 * Provides one `vector` / `scalar` overload pair per supported element type;
 * unsigned types forward to the `umin*` intrinsics, signed types to `smin*`.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_min_t {
    // 8-bit unsigned
    uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_umin8(a, b); }
    uchar scalar(uchar a, uchar b) { return __nds__umin8(a, b); }

    // 8-bit signed
    int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_smin8(a, b); }
    schar scalar(schar a, schar b) { return __nds__smin8(a, b); }

    // 16-bit unsigned
    uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_umin16(a, b); }
    ushort scalar(ushort a, ushort b) { return __nds__umin16(a, b); }

    // 16-bit signed
    int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_smin16(a, b); }
    short scalar(short a, short b) { return __nds__smin16(a, b); }

    // 32-bit signed
    int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_smin32(a, b); }
    int scalar(int a, int b) { return __nds__smin32(a, b); }
};

// Point the cv_hal_min* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined.
#undef cv_hal_min8u
#define cv_hal_min8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_min_t>)
#undef cv_hal_min8s
#define cv_hal_min8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_min_t>)
#undef cv_hal_min16u
#define cv_hal_min16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_min_t>)
#undef cv_hal_min16s
#define cv_hal_min16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_min_t>)
#undef cv_hal_min32s
#define cv_hal_min32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_min_t>)
// ################ absdiff ################
/**
 * Absolute-difference functor for elemwise_binop.
 *
 * Every overload computes `sub(max(a, b), min(a, b))` with the subtract,
 * max and min intrinsics of the matching element type, so the larger operand
 * is always the minuend.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_absdiff_t {
    // 8-bit unsigned
    uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_uksub8(__nds__v_umax8(a, b), __nds__v_umin8(a, b)); }
    uchar scalar(uchar a, uchar b) { return __nds__uksub8(__nds__umax8(a, b), __nds__umin8(a, b)); }

    // 8-bit signed
    int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_ksub8(__nds__v_smax8(a, b), __nds__v_smin8(a, b)); }
    schar scalar(schar a, schar b) { return __nds__ksub8(__nds__smax8(a, b), __nds__smin8(a, b)); }

    // 16-bit unsigned
    uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_uksub16(__nds__v_umax16(a, b), __nds__v_umin16(a, b)); }
    ushort scalar(ushort a, ushort b) { return __nds__uksub16(__nds__umax16(a, b), __nds__umin16(a, b)); }

    // 16-bit signed
    int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_ksub16(__nds__v_smax16(a, b), __nds__v_smin16(a, b)); }
    short scalar(short a, short b) { return __nds__ksub16(__nds__smax16(a, b), __nds__smin16(a, b)); }

    // 32-bit signed
    int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_ksub32(__nds__v_smax32(a, b), __nds__v_smin32(a, b)); }
    int scalar(int a, int b) { return __nds__ksub32(__nds__smax32(a, b), __nds__smin32(a, b)); }
};

// Point the cv_hal_absdiff* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined.
#undef cv_hal_absdiff8u
#define cv_hal_absdiff8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
#undef cv_hal_absdiff8s
#define cv_hal_absdiff8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
#undef cv_hal_absdiff16u
#define cv_hal_absdiff16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
#undef cv_hal_absdiff16s
#define cv_hal_absdiff16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
#undef cv_hal_absdiff32s
#define cv_hal_absdiff32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_absdiff_t>)
// ################ bitwise ################
/**
 * Bitwise AND functor for elemwise_binop.
 *
 * Only 8-bit unsigned overloads are provided; both use the built-in `&`
 * operator. The template arguments are not used by the body.
 */
template <typename src, typename dst>
struct operators_and_t {
    uint8x8_t vector(uint8x8_t a, uint8x8_t b)
    {
        return a & b;
    }

    uchar scalar(uchar a, uchar b)
    {
        return a & b;
    }
};

// Point the cv_hal_and8u hook at the NDSRVP driver, dropping any earlier definition.
#undef cv_hal_and8u
#define cv_hal_and8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_and_t>)
/**
 * Bitwise OR functor for elemwise_binop.
 *
 * Only 8-bit unsigned overloads are provided; both use the built-in `|`
 * operator. The template arguments are not used by the body.
 */
template <typename src, typename dst>
struct operators_or_t {
    uint8x8_t vector(uint8x8_t a, uint8x8_t b)
    {
        return a | b;
    }

    uchar scalar(uchar a, uchar b)
    {
        return a | b;
    }
};

// Point the cv_hal_or8u hook at the NDSRVP driver, dropping any earlier definition.
#undef cv_hal_or8u
#define cv_hal_or8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_or_t>)
/**
 * Bitwise XOR functor for elemwise_binop.
 *
 * Only 8-bit unsigned overloads are provided; both use the built-in `^`
 * operator. The template arguments are not used by the body.
 */
template <typename src, typename dst>
struct operators_xor_t {
    uint8x8_t vector(uint8x8_t a, uint8x8_t b)
    {
        return a ^ b;
    }

    uchar scalar(uchar a, uchar b)
    {
        return a ^ b;
    }
};

// Point the cv_hal_xor8u hook at the NDSRVP driver, dropping any earlier definition.
#undef cv_hal_xor8u
#define cv_hal_xor8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_xor_t>)
/**
 * Bitwise NOT functor for elemwise_unop.
 *
 * Only 8-bit unsigned overloads are provided; both use the built-in `~`
 * operator. The template arguments are not used by the body.
 */
template <typename src, typename dst>
struct operators_not_t {
    uint8x8_t vector(uint8x8_t a)
    {
        return ~a;
    }

    uchar scalar(uchar a)
    {
        return ~a;
    }
};

// Point the cv_hal_not8u hook at the NDSRVP unary driver, dropping any earlier definition.
#undef cv_hal_not8u
#define cv_hal_not8u (cv::ndsrvp::elemwise_unop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_not_t>)
// ################ cmp ################
/**
 * Comparison functor for elemwise_binop.
 *
 * Every overload takes the two operands plus `operation`, one of the
 * CV_HAL_CMP_* codes, and produces 8-bit results (one byte per compared
 * element). Only "equal", "less than" and "less or equal" intrinsics are used:
 *   - GT / GE are obtained by swapping the operands of LT / LE,
 *   - NE is the bitwise complement of EQ,
 *   - any other `operation` value yields zero.
 * NOTE(review): the compare intrinsics presumably return all-ones / all-zeros
 * per lane - confirm against the P-extension intrinsic reference.
 *
 * The template arguments are not used by the body; they let the struct fit the
 * `operators_t<srctype, dsttype>` slot of the driver.
 */
template <typename src, typename dst>
struct operators_cmp_t {
    // ---- 8-bit unsigned ----
    uint8x8_t vector(uint8x8_t a, uint8x8_t b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__v_ucmpeq8(a, b);
        case CV_HAL_CMP_GT:
            return __nds__v_ucmplt8(b, a);
        case CV_HAL_CMP_GE:
            return __nds__v_ucmple8(b, a);
        case CV_HAL_CMP_LT:
            return __nds__v_ucmplt8(a, b);
        case CV_HAL_CMP_LE:
            return __nds__v_ucmple8(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__v_ucmpeq8(a, b);
        default:
            return uint8x8_t();
        }
    }

    uchar scalar(uchar a, uchar b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__cmpeq8(a, b);
        case CV_HAL_CMP_GT:
            return __nds__ucmplt8(b, a);
        case CV_HAL_CMP_GE:
            return __nds__ucmple8(b, a);
        case CV_HAL_CMP_LT:
            return __nds__ucmplt8(a, b);
        case CV_HAL_CMP_LE:
            return __nds__ucmple8(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__cmpeq8(a, b);
        default:
            return 0;
        }
    }

    // ---- 8-bit signed ----
    uint8x8_t vector(int8x8_t a, int8x8_t b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__v_scmpeq8(a, b);
        case CV_HAL_CMP_GT:
            return __nds__v_scmplt8(b, a);
        case CV_HAL_CMP_GE:
            return __nds__v_scmple8(b, a);
        case CV_HAL_CMP_LT:
            return __nds__v_scmplt8(a, b);
        case CV_HAL_CMP_LE:
            return __nds__v_scmple8(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__v_scmpeq8(a, b);
        default:
            return uint8x8_t();
        }
    }

    uchar scalar(schar a, schar b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__cmpeq8(a, b);
        case CV_HAL_CMP_GT:
            return __nds__scmplt8(b, a);
        case CV_HAL_CMP_GE:
            return __nds__scmple8(b, a);
        case CV_HAL_CMP_LT:
            return __nds__scmplt8(a, b);
        case CV_HAL_CMP_LE:
            return __nds__scmple8(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__cmpeq8(a, b);
        default:
            return 0;
        }
    }

    // ---- 16-bit unsigned (results narrowed to one byte per element) ----
    uint8x4_t vector(uint16x4_t a, uint16x4_t b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return narrow_mask16((unsigned long)__nds__v_ucmpeq16(a, b));
        case CV_HAL_CMP_GT:
            return narrow_mask16((unsigned long)__nds__v_ucmplt16(b, a));
        case CV_HAL_CMP_GE:
            return narrow_mask16((unsigned long)__nds__v_ucmple16(b, a));
        case CV_HAL_CMP_LT:
            return narrow_mask16((unsigned long)__nds__v_ucmplt16(a, b));
        case CV_HAL_CMP_LE:
            return narrow_mask16((unsigned long)__nds__v_ucmple16(a, b));
        case CV_HAL_CMP_NE:
            return narrow_mask16(~(unsigned long)__nds__v_ucmpeq16(a, b));
        default:
            return uint8x4_t();
        }
    }

    uchar scalar(ushort a, ushort b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__cmpeq16(a, b);
        case CV_HAL_CMP_GT:
            return __nds__ucmplt16(b, a);
        case CV_HAL_CMP_GE:
            return __nds__ucmple16(b, a);
        case CV_HAL_CMP_LT:
            return __nds__ucmplt16(a, b);
        case CV_HAL_CMP_LE:
            return __nds__ucmple16(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__cmpeq16(a, b);
        default:
            return 0;
        }
    }

    // ---- 16-bit signed (results narrowed to one byte per element) ----
    uint8x4_t vector(int16x4_t a, int16x4_t b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return narrow_mask16((unsigned long)__nds__v_scmpeq16(a, b));
        case CV_HAL_CMP_GT:
            return narrow_mask16((unsigned long)__nds__v_scmplt16(b, a));
        case CV_HAL_CMP_GE:
            return narrow_mask16((unsigned long)__nds__v_scmple16(b, a));
        case CV_HAL_CMP_LT:
            return narrow_mask16((unsigned long)__nds__v_scmplt16(a, b));
        case CV_HAL_CMP_LE:
            return narrow_mask16((unsigned long)__nds__v_scmple16(a, b));
        case CV_HAL_CMP_NE:
            return narrow_mask16(~(unsigned long)__nds__v_scmpeq16(a, b));
        default:
            return uint8x4_t();
        }
    }

    uchar scalar(short a, short b, int operation)
    {
        switch (operation) {
        case CV_HAL_CMP_EQ:
            return __nds__cmpeq16(a, b);
        case CV_HAL_CMP_GT:
            return __nds__scmplt16(b, a);
        case CV_HAL_CMP_GE:
            return __nds__scmple16(b, a);
        case CV_HAL_CMP_LT:
            return __nds__scmplt16(a, b);
        case CV_HAL_CMP_LE:
            return __nds__scmple16(a, b);
        case CV_HAL_CMP_NE:
            return ~__nds__cmpeq16(a, b);
        default:
            return 0;
        }
    }

private:
    // Narrows a 4 x 16-bit comparison mask, passed as one 64-bit word, to
    // 4 bytes. The word is shifted right by 8 and then the low halfwords of
    // its upper and lower 32-bit parts are packed with pkbb16.
    // The temporary is a plain local: `register` is not allowed in C++17.
    // NOTE(review): relies on `unsigned long` being 64 bits wide (shift by 32)
    // - confirm this header is only built for RV64.
    static uint8x4_t narrow_mask16(unsigned long mask)
    {
        const unsigned long cmp = mask >> 8;
        return (uint8x4_t)(unsigned int)__nds__pkbb16(cmp >> 32, cmp);
    }
};

// Point the cv_hal_cmp* hooks at the NDSRVP driver; each #undef drops any
// earlier definition of the hook before it is redefined. The destination
// element type is always uchar.
#undef cv_hal_cmp8u
#define cv_hal_cmp8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
#undef cv_hal_cmp8s
#define cv_hal_cmp8s (cv::ndsrvp::elemwise_binop<schar, uchar, int8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
#undef cv_hal_cmp16u
#define cv_hal_cmp16u (cv::ndsrvp::elemwise_binop<ushort, uchar, uint16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
#undef cv_hal_cmp16s
#define cv_hal_cmp16s (cv::ndsrvp::elemwise_binop<short, uchar, int16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
// ################ split ################
/*template <typename srctype, typename vsrctype, int nlane>
int split(const srctype* src_data, srctype** dst_data, int len, int cn)
{
int i, j;
for (i = 0; i < len; i++) {
for (j = 0; j < cn; j++) {
dst_data[j][i] = src_data[i * cn + j];
}
}
return CV_HAL_ERROR_OK;
}
#undef cv_hal_split8u
#define cv_hal_split8u (cv::ndsrvp::split<uchar, uint8x8_t, 8>)
#undef cv_hal_split16u
#define cv_hal_split16u (cv::ndsrvp::split<ushort, uint16x4_t, 4>)
#undef cv_hal_split32s
#define cv_hal_split32s (cv::ndsrvp::split<int, int32x2_t, 2>)*/
// ################ merge ################
/*template <typename srctype, typename vsrctype, int nlane>
int merge(const srctype** src_data, srctype* dst_data, int len, int cn)
{
int i, j;
for (i = 0; i < len; i++) {
for (j = 0; j < cn; j++) {
dst_data[i * cn + j] = src_data[j][i];
}
}
return CV_HAL_ERROR_OK;
}
#undef cv_hal_merge8u
#define cv_hal_merge8u (cv::ndsrvp::merge<uchar, uint8x8_t, 8>)
#undef cv_hal_merge16u
#define cv_hal_merge16u (cv::ndsrvp::merge<ushort, uint16x4_t, 4>)
#undef cv_hal_merge32s
#define cv_hal_merge32s (cv::ndsrvp::merge<int, int32x2_t, 2>)*/
} // namespace ndsrvp
} // namespace cv
#endif