imgproc: NEON fixes for Canny, NEON paths for calcHarris and preCornerDetect

* canny.cpp: the NEON loops used `j < width - 8`, which sent the last full
  group of 8 elements to the scalar tail; the bound is now `j <= width - 8`.
* corner.cpp: add NEON branches to calcHarris and cv::preCornerDetect that
  handle 4 floats per iteration; the existing scalar loops process the rest.

diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index bfb37dc23c..11365a33f7 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -362,7 +362,7 @@ void cv::Canny( InputArray _src, OutputArray _dst,
                     }
                 }
 #elif CV_NEON
-                for ( ; j < width - 8; j += 8)
+                for ( ; j <= width - 8; j += 8)
                 {
                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
                     vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy))));
@@ -394,7 +394,7 @@ void cv::Canny( InputArray _src, OutputArray _dst,
                     }
                 }
 #elif CV_NEON
-                for ( ; j < width - 8; j += 8)
+                for ( ; j <= width - 8; j += 8)
                 {
                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
                     int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy));
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 6318784278..01265a5b6e 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -146,6 +146,19 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
                 _mm_storeu_ps(dst + j, a);
             }
         }
+    #elif CV_NEON
+        float32x4_t v_k = vdupq_n_f32((float)k);
+
+        for( ; j <= size.width - 4; j += 4 )
+        {
+            // vld3q_f32 de-interleaves 12 consecutive floats (3 per output element), so the read offset is j * 3
+            float32x4x3_t v_src = vld3q_f32(cov + j * 3);
+            float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2];
+            float32x4_t v_ac_bb = vsubq_f32(vmulq_f32(v_a, v_c), vmulq_f32(v_b, v_b));
+            float32x4_t v_ac = vaddq_f32(v_a, v_c);
+            float32x4_t v_prod = vmulq_f32(v_k, vmulq_f32(v_ac, v_ac));
+            vst1q_f32(dst + j, vsubq_f32(v_ac_bb, v_prod));
+        }
     #endif
 
         for( ; j < size.width; j++ )
@@ -641,6 +654,15 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
                 _mm_storeu_ps(dstdata + j, v_s1);
             }
         }
+#elif CV_NEON
+        for( ; j <= size.width - 4; j += 4 )
+        {
+            float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j);
+            float32x4_t v_s1 = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j)));
+            float32x4_t v_s2 = vmulq_f32(v_dy, vmulq_f32(v_dy, vld1q_f32(d2xdata + j)));
+            float32x4_t v_s3 = vmulq_f32(v_dx, vmulq_f32(v_dy, vld1q_f32(dxydata + j)));
+            vst1q_f32(dstdata + j, vaddq_f32(vaddq_f32(v_s1, v_s2), vmulq_n_f32(v_s3, -2.0f)));
+        }
 #endif
 
         for( ; j < size.width; j++ )