From 4dfb613c3e31d014b6b34b574efff93968c31d60 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 28 Jul 2014 14:54:41 +0400 Subject: [PATCH 1/4] optimized Bayer=>RGB/RGBA/Gray conversion using Neon intrinsics. Fixed recently introduced build error in iOS framework. --- modules/imgproc/src/demosaicing.cpp | 182 +++++++++++++++++++++++++++- modules/videoio/CMakeLists.txt | 2 +- 2 files changed, 182 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 9326fa1932..3182c19db2 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -65,6 +65,11 @@ public: { return 0; } + + int bayer2RGBA(const T*, int, T*, int, int) const + { + return 0; + } int bayer2RGB_EA(const T*, int, T*, int, int) const { @@ -218,6 +223,11 @@ public: return (int)(bayer - (bayer_end - width)); } + int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + { + return 0; + } + int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const { if (!use_simd) @@ -323,6 +333,174 @@ public: bool use_simd; }; +#elif CV_NEON +class SIMDBayerInterpolator_8u +{ +public: + SIMDBayerInterpolator_8u() + { + } + + int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst, + int width, int bcoeff, int gcoeff, int rcoeff) const + { + /* + B G B G | B G B G | B G B G | B G B G + G R G R | G R G R | G R G R | G R G R + B G B G | B G B G | B G B G | B G B G + */ + + uint16x8_t masklo = vdupq_n_s16(255); + const uchar* bayer_end = bayer + width; + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 ) + { + uint16x8_t r0 = vld1q_u16((const ushort*)bayer); + uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step)); + uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2)); + + uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); + uint16x8_t nextb1 = vextq_u16(b1, b1, 1); + uint16x8_t b0 = vaddq_u16(b1, nextb1); + b1 = vshlq_n_u16(nextb1, 1); + // b0 = b0 b2 b4 ... + // b1 = b1 b3 b5 ... + + uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8)); + uint16x8_t g1 = vandq_u16(r1, masklo); + g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1))); + g1 = vshlq_n_u16(vextq_u16(g1, g1, 1), 2); + // g0 = b0 b2 b4 ... + // g1 = b1 b3 b5 ... + + r0 = vshrq_n_u16(r1, 8); + r1 = vshlq_n_u16(vaddq_u16(r0, vextq_u16(r0, r0, 1)), 1); + r0 = vshlq_n_u16(r0, 2); + // r0 = r0 r2 r4 ... + // r1 = r1 r3 r5 ... 
+ + b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2))); + b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*2))); + + g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2))); + g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2))); + + r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2))); + r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*2))); + + g0 = vshrq_n_u16(vaddq_u16(vaddq_u16(g0, b0), r0), 2); + g1 = vshrq_n_u16(vaddq_u16(vaddq_u16(g1, b1), r1), 2); + + uint8x8x2_t p = vzip_u8(vqmovn_u16(g0), vqmovn_u16(g1)); + vst1_u8(dst, p.val[0]); + vst1_u8(dst + 8, p.val[1]); + } + + return (int)(bayer - (bayer_end - width)); + } + + int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + { + /* + B G B G | B G B G | B G B G | B G B G + G R G R | G R G R | G R G R | G R G R + B G B G | B G B G | B G B G | B G B G + */ + uint16x8_t masklo = vdupq_n_u16(255); + uint8x16x3_t pix; + const uchar* bayer_end = bayer + width; + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 ) + { + uint16x8_t r0 = vld1q_u16((const ushort*)bayer); + uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step)); + uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2)); + + uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); + uint16x8_t nextb1 = vextq_u16(b1, b1, 1); + uint16x8_t b0 = vaddq_u16(b1, nextb1); + b1 = vrshrq_n_u16(nextb1, 1); + b0 = vrshrq_n_u16(b0, 2); + // b0 b1 b2 ... + uint8x8x2_t bb = vzip_u8(vmovn_u16(b0), vmovn_u16(b1)); + pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]); + + uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8)); + uint16x8_t g1 = vandq_u16(r1, masklo); + g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1))); + g1 = vextq_u16(g1, g1, 1); + g0 = vrshrq_n_u16(g0, 2); + // g0 g1 g2 ... + uint8x8x2_t gg = vzip_u8(vmovn_u16(g0), vmovn_u16(g1)); + pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]); + + r0 = vshrq_n_u16(r1, 8); + r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1)); + r1 = vrshrq_n_u16(r1, 1); + // r0 r1 r2 ... + uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vmovn_u16(r1)); + pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]); + + vst3q_u8(dst-1, pix); + } + + return (int)(bayer - (bayer_end - width)); + } + + int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + { + /* + B G B G | B G B G | B G B G | B G B G + G R G R | G R G R | G R G R | G R G R + B G B G | B G B G | B G B G | B G B G + */ + uint16x8_t masklo = vdupq_n_u16(255); + uint8x16x4_t pix; + const uchar* bayer_end = bayer + width; + pix.val[3] = vdupq_n_u8(255); + + for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 ) + { + uint16x8_t r0 = vld1q_u16((const ushort*)bayer); + uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step)); + uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2)); + + uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); + uint16x8_t nextb1 = vextq_u16(b1, b1, 1); + uint16x8_t b0 = vaddq_u16(b1, nextb1); + b1 = vrshrq_n_u16(nextb1, 1); + b0 = vrshrq_n_u16(b0, 2); + // b0 b1 b2 ... 
+ uint8x8x2_t bb = vzip_u8(vmovn_u16(b0), vmovn_u16(b1)); + pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]); + + uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8)); + uint16x8_t g1 = vandq_u16(r1, masklo); + g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1))); + g1 = vextq_u16(g1, g1, 1); + g0 = vrshrq_n_u16(g0, 2); + // g0 g1 g2 ... + uint8x8x2_t gg = vzip_u8(vmovn_u16(g0), vmovn_u16(g1)); + pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]); + + r0 = vshrq_n_u16(r1, 8); + r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1)); + r1 = vrshrq_n_u16(r1, 1); + // r0 r1 r2 ... + uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vmovn_u16(r1)); + pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]); + + vst4q_u8(dst-1, pix); + } + + return (int)(bayer - (bayer_end - width)); + } + + int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + { + return 0; + } +}; #else typedef SIMDBayerStubInterpolator_ SIMDBayerInterpolator_8u; #endif @@ -559,7 +737,9 @@ public: } // simd optimization only for dcn == 3 - int delta = dcn == 4 ? 0 : vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue); + int delta = dcn == 4 ? + vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) : + vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue); bayer += delta; dst += delta*dcn; diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt index bba3d33396..96ac5045f5 100644 --- a/modules/videoio/CMakeLists.txt +++ b/modules/videoio/CMakeLists.txt @@ -148,7 +148,7 @@ endif(HAVE_INTELPERC) if(IOS) add_definitions(-DHAVE_IOS=1) - list(APPEND videoio_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm) + list(APPEND videoio_srcs src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm) list(APPEND VIDEOIO_LIBRARIES "-framework Accelerate" "-framework AVFoundation" "-framework CoreGraphics" "-framework CoreImage" "-framework CoreMedia" "-framework CoreVideo" "-framework QuartzCore" "-framework AssetsLibrary") endif() From 4255746c0090408ad43d7073ad64bbe0e38d3a1a Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 28 Jul 2014 15:20:25 +0400 Subject: [PATCH 2/4] fixed compile warnings and removed extra whitespaces --- modules/imgproc/src/demosaicing.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 3182c19db2..61a4fe7ed0 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -65,7 +65,7 @@ public: { return 0; } - + int bayer2RGBA(const T*, int, T*, int, int) const { return 0; @@ -223,7 +223,7 @@ public: return (int)(bayer - (bayer_end - width)); } - int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + int bayer2RGBA(const uchar*, int, uchar*, int, int) const { return 0; } @@ -395,7 +395,7 @@ public: vst1_u8(dst, p.val[0]); vst1_u8(dst + 8, p.val[1]); } - + return (int)(bayer - (bayer_end - width)); } @@ -446,7 +446,7 @@ public: return (int)(bayer - (bayer_end - width)); } - + int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const { /* @@ -492,11 +492,11 @@ public: vst4q_u8(dst-1, pix); } - + return (int)(bayer - (bayer_end - width)); } - int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const { return 0; } @@ -737,7 +737,7 @@ public: } // simd 
optimization only for dcn == 3 - int delta = dcn == 4 ? + int delta = dcn == 4 ? vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) : vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue); bayer += delta; From 11e9e375a3a44330eb29bb6e299d14687c94c7da Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 28 Jul 2014 19:23:46 +0400 Subject: [PATCH 3/4] fixed compile warning with GCC --- modules/imgproc/src/demosaicing.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 61a4fe7ed0..3265545583 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -350,7 +350,7 @@ public: B G B G | B G B G | B G B G | B G B G */ - uint16x8_t masklo = vdupq_n_s16(255); + uint16x8_t masklo = vdupq_n_u16(255); const uchar* bayer_end = bayer + width; for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 ) From 101769d26cbb7dda0b0b69c4c2b40998872fc6f9 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 29 Jul 2014 18:10:18 +0400 Subject: [PATCH 4/4] eliminated some unnecessary instructions --- modules/imgproc/src/demosaicing.cpp | 39 +++++++++++------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 3265545583..ff730ee941 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -359,10 +359,9 @@ public: uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step)); uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2)); - uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); - uint16x8_t nextb1 = vextq_u16(b1, b1, 1); - uint16x8_t b0 = vaddq_u16(b1, nextb1); - b1 = vshlq_n_u16(nextb1, 1); + uint16x8_t b1_ = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); + uint16x8_t b1 = vextq_u16(b1_, b1_, 1); + uint16x8_t b0 = vaddq_u16(b1_, b1); // b0 = b0 b2 b4 ... // b1 = b1 b3 b5 ... @@ -374,24 +373,24 @@ public: // g1 = b1 b3 b5 ... r0 = vshrq_n_u16(r1, 8); - r1 = vshlq_n_u16(vaddq_u16(r0, vextq_u16(r0, r0, 1)), 1); + r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1)); r0 = vshlq_n_u16(r0, 2); // r0 = r0 r2 r4 ... // r1 = r1 r3 r5 ... 
b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2))); - b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*2))); + b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*4))); g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2))); g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2))); r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2))); - r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*2))); + r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*4))); - g0 = vshrq_n_u16(vaddq_u16(vaddq_u16(g0, b0), r0), 2); - g1 = vshrq_n_u16(vaddq_u16(vaddq_u16(g1, b1), r1), 2); + g0 = vaddq_u16(vaddq_u16(g0, b0), r0); + g1 = vaddq_u16(vaddq_u16(g1, b1), r1); - uint8x8x2_t p = vzip_u8(vqmovn_u16(g0), vqmovn_u16(g1)); + uint8x8x2_t p = vzip_u8(vrshrn_n_u16(g0, 2), vrshrn_n_u16(g1, 2)); vst1_u8(dst, p.val[0]); vst1_u8(dst + 8, p.val[1]); } @@ -419,26 +418,22 @@ public: uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); uint16x8_t nextb1 = vextq_u16(b1, b1, 1); uint16x8_t b0 = vaddq_u16(b1, nextb1); - b1 = vrshrq_n_u16(nextb1, 1); - b0 = vrshrq_n_u16(b0, 2); // b0 b1 b2 ... - uint8x8x2_t bb = vzip_u8(vmovn_u16(b0), vmovn_u16(b1)); + uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1)); pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]); uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8)); uint16x8_t g1 = vandq_u16(r1, masklo); g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1))); g1 = vextq_u16(g1, g1, 1); - g0 = vrshrq_n_u16(g0, 2); // g0 g1 g2 ... - uint8x8x2_t gg = vzip_u8(vmovn_u16(g0), vmovn_u16(g1)); + uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1)); pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]); r0 = vshrq_n_u16(r1, 8); r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1)); - r1 = vrshrq_n_u16(r1, 1); // r0 r1 r2 ... - uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vmovn_u16(r1)); + uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1)); pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]); vst3q_u8(dst-1, pix); @@ -468,26 +463,22 @@ public: uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo)); uint16x8_t nextb1 = vextq_u16(b1, b1, 1); uint16x8_t b0 = vaddq_u16(b1, nextb1); - b1 = vrshrq_n_u16(nextb1, 1); - b0 = vrshrq_n_u16(b0, 2); // b0 b1 b2 ... - uint8x8x2_t bb = vzip_u8(vmovn_u16(b0), vmovn_u16(b1)); + uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1)); pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]); uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8)); uint16x8_t g1 = vandq_u16(r1, masklo); g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1))); g1 = vextq_u16(g1, g1, 1); - g0 = vrshrq_n_u16(g0, 2); // g0 g1 g2 ... - uint8x8x2_t gg = vzip_u8(vmovn_u16(g0), vmovn_u16(g1)); + uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1)); pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]); r0 = vshrq_n_u16(r1, 8); r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1)); - r1 = vrshrq_n_u16(r1, 1); // r0 r1 r2 ... - uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vmovn_u16(r1)); + uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1)); pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]); vst4q_u8(dst-1, pix);
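
Note on the fixed-point math in the bayer2Gray loops above (a scalar sketch, not part of the patches; the patch only forwards bcoeff/gcoeff/rcoeff, so treating them as 14-bit fixed-point grey weights, as cvtColor's BGR->gray table defines them, is an assumption made here for illustration):

    // What vqdmulhq_n_s16(v, (short)(coeff*2)) computes per 16-bit lane.
    // VQDMULH returns the saturated high half of the doubled product,
    // i.e. sat((a*b*2) >> 16); with b == coeff*2 that is (a*coeff) >> 14,
    // so the 14-bit channel weights are applied without widening to 32 bits.
    #include <algorithm>
    #include <cstdint>

    static inline int16_t scalar_vqdmulh(int16_t a, int16_t b)
    {
        int64_t doubled = (int64_t)a * b * 2;          // doubling multiply
        int32_t high    = (int32_t)(doubled >> 16);    // keep the high half
        return (int16_t)std::min(32767, std::max(-32768, high));  // saturate
    }

The trailing >>2 in those loops (vshrq_n_u16 / vrshrn_n_u16) then removes the x4 scale carried by the even-lane neighbourhood sums; the odd lanes compensate either by pre-shifting (g1) or, after the last patch, by doubling the coefficient passed to vqdmulhq_n_s16 (b1, r1).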