From a2d784b6f53aa1fdfde21ab8e3787a93b59af24f Mon Sep 17 00:00:00 2001 From: amane-ame Date: Fri, 7 Mar 2025 20:56:04 +0800 Subject: [PATCH] Add sepFilter. Co-authored-by: Liutong HAN --- 3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp | 244 ++++++++++++++++++++++-- 1 file changed, 228 insertions(+), 16 deletions(-) diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp index abe6d35d0c..09ba023021 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/filter.hpp @@ -37,10 +37,10 @@ private: }; template -static inline int invoke(int height, std::function func, Args&&... args) +static inline int invoke(int start, int end, std::function func, Args&&... args) { - cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); - return func(0, 1, std::forward(args)...); + cv::parallel_for_(Range(start + 1, end), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); + return func(start, start + 1, std::forward(args)...); } struct Filter2D @@ -182,21 +182,22 @@ static inline int filter(int start, int end, Filter2D* data, const uchar* src_da kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; } + constexpr int noval = std::numeric_limits::max(); auto access = [&](int x, int y) { int pi, pj; if (data->borderType & BORDER_ISOLATED) { pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); pj = borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - if (pi >= 0) - pi += offset_y; - if (pj >= 0) - pj += offset_x; + pi = pi < 0 ? noval : pi; + pj = pj < 0 ? noval : pj; } else { pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); pj = borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; } return std::make_pair(pi, pj); }; @@ -207,7 +208,7 @@ static inline int filter(int start, int end, Filter2D* data, const uchar* src_da for (int i = 0; i < ksize * ksize; i++) { auto p = access(x + i / ksize, y + i % ksize); - if (p.first >= 0 && p.second >= 0) + if (p.first != noval && p.second != noval) { sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; @@ -236,17 +237,17 @@ static inline int filter(int start, int end, Filter2D* data, const uchar* src_da for (int j = right; j < width; j++) process(i, j); - const uchar* row0 = access(i , 0).first < 0 ? nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first < 0 ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first < 0 ? nullptr : src_data + access(i + 2, 0).first * src_step; + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; if (ksize == 3) { process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); } else { - const uchar* row3 = access(i + 3, 0).first < 0 ? nullptr : src_data + access(i + 3, 0).first * src_step; - const uchar* row4 = access(i + 4, 0).first < 0 ? nullptr : src_data + access(i + 4, 0).first * src_step; + const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; + const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); } } @@ -257,7 +258,6 @@ static inline int filter(int start, int end, Filter2D* data, const uchar* src_da inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) { - src_data -= offset_y * src_step - offset_x * 4; Filter2D* data = reinterpret_cast(context); std::vector dst(width * height * 4); @@ -265,10 +265,10 @@ inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, ucha switch (data->kernel_width) { case 3: - res = invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + res = invoke(0, height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); break; case 5: - res = invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + res = invoke(0, height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); break; } @@ -284,6 +284,218 @@ inline int filterFree(cvhalFilter2D* context) } } // cv::cv_hal_rvv::filter +namespace sepFilter { +#undef cv_hal_sepFilterInit +#undef cv_hal_sepFilter +#undef cv_hal_sepFilterFree +#define cv_hal_sepFilterInit cv::cv_hal_rvv::sepFilter::sepFilterInit +#define cv_hal_sepFilter cv::cv_hal_rvv::sepFilter::sepFilter +#define cv_hal_sepFilterFree cv::cv_hal_rvv::sepFilter::sepFilterFree + +struct sepFilter2D +{ + int src_type; + int dst_type; + int kernel_type; + const uchar* kernelx_data; + int kernelx_length; + const uchar* kernely_data; + int kernely_length; + int anchor_x; + int anchor_y; + double delta; + int borderType; +}; + +inline int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar *kernelx_data, int kernelx_length, uchar *kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) +{ + if (kernel_type != CV_32FC1 || src_type != CV_8UC1 || (dst_type != CV_16SC1 && dst_type != CV_32FC1)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernelx_length != kernely_length) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernelx_length != 3 && kernelx_length != 5) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; + anchor_y = anchor_y < 0 ? kernely_length / 2 : anchor_y; + *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); + return CV_HAL_ERROR_OK; +} + +template +static inline int sepFilterRow(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, float* dst_data, int width, int full_width, int offset_x) +{ + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int y) { + int pj; + if (data->borderType & BORDER_ISOLATED) + { + pj = borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); + pj = pj < 0 ? noval : pj; + } + else + { + pj = borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); + pj = pj < 0 ? noval : pj - offset_x; + } + return pj; + }; + + const float* kx = reinterpret_cast(data->kernelx_data); + auto process = [&](int x, int y) { + float sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = access(y + i); + if (p != noval) + { + sum += kx[i] * src_data[x * src_step + p]; + } + } + dst_data[x * width + y] = sum; + }; + + for (int i = start; i < end; i++) + { + const int left = ksize - 1, right = width - (ksize - 1); + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m2(right - j); + const uchar* extra = src_data + i * src_step + j - data->anchor_x; + auto sum = __riscv_vfmv_v_f_f32m8(0, vl); + auto src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(extra, vl), vl), vl); + sum = __riscv_vfmacc(sum, kx[0], src, vl); + src = __riscv_vfslide1down(src, extra[vl], vl); + sum = __riscv_vfmacc(sum, kx[1], src, vl); + src = __riscv_vfslide1down(src, extra[vl + 1], vl); + sum = __riscv_vfmacc(sum, kx[2], src, vl); + if (ksize == 5) + { + src = __riscv_vfslide1down(src, extra[vl + 2], vl); + sum = __riscv_vfmacc(sum, kx[3], src, vl); + src = __riscv_vfslide1down(src, extra[vl + 3], vl); + sum = __riscv_vfmacc(sum, kx[4], src, vl); + } + __riscv_vse32(dst_data + i * width + j, sum, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int sepFilterCol(int start, int end, sepFilter2D* data, const float* src_data, uchar* dst_data, size_t dst_step, int width, int height, int full_height, int offset_y) +{ + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x) { + int pi; + if (data->borderType & BORDER_ISOLATED) + { + pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + } + else + { + pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + } + return pi; + }; + + const float* ky = reinterpret_cast(data->kernely_data); + for (int i = start; i < end; i++) + { + const float* row0 = access(i ) == noval ? nullptr : src_data + access(i ) * width; + const float* row1 = access(i + 1) == noval ? nullptr : src_data + access(i + 1) * width; + const float* row2 = access(i + 2) == noval ? nullptr : src_data + access(i + 2) * width; + const float* row3, *row4; + if (ksize == 5) + { + row3 = access(i + 3) == noval ? nullptr : src_data + access(i + 3) * width; + row4 = access(i + 4) == noval ? nullptr : src_data + access(i + 4) * width; + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); + + if (ksize == 5) + { + auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v4 = row4 ? __riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); + } + if (data->dst_type == CV_16SC1) + { + __riscv_vse16(reinterpret_cast(dst_data + i * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); + } + else + { + __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, sum, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +inline int sepFilter(cvhalFilter2D *context, uchar *src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + sepFilter2D* data = reinterpret_cast(context); + const int padding = data->kernelx_length - 1; + std::vector _result(width * (height + 2 * padding)); + float* result = _result.data() + width * padding; + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernelx_length) + { + case 3: + res = filter::invoke(-std::min(offset_y, padding), height + std::min(full_height - height - offset_y, padding), {sepFilterRow<3>}, data, src_data, src_step, result, width, full_width, offset_x); + break; + case 5: + res = filter::invoke(-std::min(offset_y, padding), height + std::min(full_height - height - offset_y, padding), {sepFilterRow<5>}, data, src_data, src_step, result, width, full_width, offset_x); + break; + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (data->kernelx_length) + { + case 3: + return filter::invoke(0, height, {sepFilterCol<3>}, data, result, dst_data, dst_step, width, height, full_height, offset_y); + case 5: + return filter::invoke(0, height, {sepFilterCol<5>}, data, result, dst_data, dst_step, width, height, full_height, offset_y); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +inline int sepFilterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} +} // cv::cv_hal_rvv::sepFilter + }} #endif