diff --git a/modules/ocl/src/color.cpp b/modules/ocl/src/color.cpp index 0af58643c9..e323934b4c 100644 --- a/modules/ocl/src/color.cpp +++ b/modules/ocl/src/color.cpp @@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: { int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; - std::string build_options = format("-D DEPTH_%d", src.depth()); + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item); if (!additionalOptions.empty()) build_options += additionalOptions; @@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: if (!data2.empty()) args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data )); + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), + const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat()) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data1.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data1.data )); + if (!data2.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - std::string build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels()); + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; + + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); if (!additionalOptions.empty()) build_options += additionalOptions; @@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -119,10 +238,13 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); } -static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(), - dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER"); + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); @@ -136,6 +258,36 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", + src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER"); + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d", - src.depth(), greenbits, dst.channels()); + std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d", + src.depth(), greenbits, dst.channels(), bidx); int src_offset = src.offset >> 1, src_step = src.step >> 1; int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1(); @@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d", - src.depth(), greenbits, src.channels()); + std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d", + src.depth(), greenbits, src.channels(), bidx); int src_offset = (int)src.offset, src_step = (int)src.step; int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1; @@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) CV_Assert(scn == 1); dcn = code == CV_GRAY2BGRA ? 4 : 3; dst.create(sz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, 0, "Gray2RGB"); + fromGray_caller(src, dst, 0, "Gray2RGB"); break; } case CV_BGR2YUV: case CV_RGB2YUV: @@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) Size dstSz(sz.width, sz.height * 2 / 3); dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12"); + toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12"); break; } case CV_BGR2YCrCb: case CV_RGB2YCrCb: @@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) initialized = true; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); return; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); break; } case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL: @@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) dst.create(sz, CV_MAKETYPE(depth, dcn)); std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB"; - toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); + fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); break; } case CV_RGBA2mRGBA: case CV_mRGBA2RGBA: diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index bf3b6cfa76..5c236f0e05 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -56,35 +56,59 @@ #ifdef DEPTH_0 #define DATA_TYPE uchar +#define VECTOR2 uchar2 +#define VECTOR4 uchar4 +#define VECTOR8 uchar8 +#define VECTOR16 uchar16 #define COEFF_TYPE int #define MAX_NUM 255 #define HALF_MAX 128 #define SAT_CAST(num) convert_uchar_sat_rte(num) +#define SAT_CAST2(num) convert_uchar2_sat(num) +#define SAT_CAST4(num) convert_uchar4_sat(num) #endif #ifdef DEPTH_2 #define DATA_TYPE ushort +#define VECTOR2 ushort2 +#define VECTOR4 ushort4 +#define VECTOR8 ushort8 +#define VECTOR16 ushort16 #define COEFF_TYPE int #define MAX_NUM 65535 #define HALF_MAX 32768 #define SAT_CAST(num) convert_ushort_sat_rte(num) +#define SAT_CAST2(num) convert_ushort2_sat(num) +#define SAT_CAST4(num) convert_ushort4_sat(num) #endif #ifdef DEPTH_5 #define DATA_TYPE float +#define VECTOR2 float2 +#define VECTOR4 float4 +#define VECTOR8 float8 +#define VECTOR16 float16 #define COEFF_TYPE float #define MAX_NUM 1.0f #define HALF_MAX 0.5f #define SAT_CAST(num) (num) #endif +#ifndef bidx + #define bidx 0 +#endif + +#ifndef pixels_per_work_item + #define pixels_per_work_item 1 +#endif + #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) enum { yuv_shift = 14, xyz_shift = 12, - hsv_shift = 12, + hsv_shift = 12, R2Y = 4899, G2Y = 9617, B2Y = 1868, @@ -93,26 +117,87 @@ enum ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// +__constant float c_RGB2GrayCoeffs_f[3] = { 0.114f, 0.587f, 0.299f }; +__constant int c_RGB2GrayCoeffs_i[3] = { B2Y, G2Y, R2Y }; + __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) { int src_idx = mad24(y, src_step, src_offset + (x << 2)); int dst_idx = mad24(y, dst_step, dst_offset + x); + +#ifndef INTEL_DEVICE + #ifdef DEPTH_5 dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f; #else dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift); #endif + +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#ifdef DEPTH_5 + __constant float * coeffs = c_RGB2GrayCoeffs_f; +#else + __constant int * coeffs = c_RGB2GrayCoeffs_i; +#endif + +#if (1 == pixels_per_work_item) + { +#ifdef DEPTH_5 + *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2]; +#else + *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift); +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2]; +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + const VECTOR2 Y = SAT_CAST2(yi); +#endif + + vstore2(Y, 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + + vstore4(SAT_CAST4(Y), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item +#endif //INTEL_DEVICE } } -__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { @@ -140,10 +225,10 @@ __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877 __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -151,24 +236,84 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YUVCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YUVCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int U = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( U ); + dst_ptr[2] = SAT_CAST( V ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 U = SAT_CAST2(ui); + const VECTOR2 V = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 U = SAT_CAST4(ui); + const VECTOR4 V = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -176,10 +321,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -187,26 +332,94 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_YUV2RGBCoeffs_f; - float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; - float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; - float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; #else __constant int * coeffs = c_YUV2RGBCoeffs_i; - int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); - int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); - int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[dst_idx + bidx] = SAT_CAST( b ); - dst[dst_idx + 1] = SAT_CAST( g ); - dst[dst_idx + (bidx^2)] = SAT_CAST( r ); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; + float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; + float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; +#else + int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 U = r0.s15; + const float2 V = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 U = convert_int2(r0.s15); + const int2 V = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 U = convert_int4(r0.s159d); + const int4 V = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } @@ -218,7 +431,7 @@ __constant int ITUR_BT_601_CVR = 1673527; __constant int ITUR_BT_601_SHIFT = 20; __kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar* src, __global uchar* dst, + __global const uchar* src, __global uchar* dst, int src_offset, int dst_offset) { const int x = get_global_id(0); @@ -275,10 +488,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564 __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -287,24 +500,82 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YCrCbCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( Cr ); + dst_ptr[2] = SAT_CAST( Cb ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Cr = SAT_CAST2(ui); + const VECTOR2 Cb = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Cr = SAT_CAST4(ui); + const VECTOR4 Cb = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f }; __constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 }; __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -324,36 +595,103 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 - __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX); - float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX); - float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX); + __constant float * coeffs = c_YCrCb2RGBCoeffs_f; #else - __constant int * coeff = c_YCrCb2RGBCoeffs_i; - int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift); - int g = ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift); - int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift); + __constant int * coeffs = c_YCrCb2RGBCoeffs_i; #endif - dst[dst_idx + (bidx^2)] = SAT_CAST(r); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + bidx] = SAT_CAST(b); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3]; + float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1]; + float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0]; +#else + int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 Cr = r0.s15; + const float2 Cb = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 Cr = convert_int2(r0.s15); + const int2 Cb = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 Cr = convert_int4(r0.s159d); + const int4 Cb = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } ///////////////////////////////////// RGB <-> XYZ ////////////////////////////////////// __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -362,28 +700,84 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2]; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(x); - dst[dst_idx + 1] = SAT_CAST(y); - dst[dst_idx + 2] = SAT_CAST(z); + + dst_ptr[0] = SAT_CAST( X ); + dst_ptr[1] = SAT_CAST( Y ); + dst_ptr[2] = SAT_CAST( Z ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 R = r0.s04; + const float2 G = r0.s15; + const float2 B = r0.s26; + + const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; +#else + const int2 R = convert_int2(r0.s04); + const int2 G = convert_int2(r0.s15); + const int2 B = convert_int2(r0.s26); + + const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR2 X = SAT_CAST2(xi); + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Z = SAT_CAST2(zi); +#endif + + vstore8((VECTOR8)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 R = convert_int4(r0.s048c); + const int4 G = convert_int4(r0.s159d); + const int4 B = convert_int4(r0.s26ae); + + const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR4 X = SAT_CAST4(xi); + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Z = SAT_CAST4(zi); + + vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -392,23 +786,87 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2]; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift); + int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift); + int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(b); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + 2] = SAT_CAST(r); + + dst_ptr[0] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[2] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; #endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 X = r0.s04; + const float2 Y = r0.s15; + const float2 Z = r0.s26; + + float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; +#else + const int2 xi = convert_int2(r0.s04); + const int2 yi = convert_int2(r0.s15); + const int2 zi = convert_int2(r0.s26); + + const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int2 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR2 R = SAT_CAST2(ri); + const VECTOR2 G = SAT_CAST2(gi); + const VECTOR2 B = SAT_CAST2(bi); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 xi = convert_int4(r0.s048c); + const int4 yi = convert_int4(r0.s159d); + const int4 zi = convert_int4(r0.s26ae); + + const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR4 R = SAT_CAST4(ri); + const VECTOR4 G = SAT_CAST4(gi); + const VECTOR4 B = SAT_CAST4(bi); + +#if dcn == 4 + vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif // pixels_per_work_item } } @@ -427,6 +885,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); +#ifndef INTEL_DEVICE #ifdef REVERSE dst[dst_idx] = src[src_idx + 2]; dst[dst_idx + 1] = src[src_idx + 1]; @@ -444,12 +903,43 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, dst[dst_idx + 3] = src[src_idx + 3]; #endif #endif +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + const VECTOR4 r0 = vload4(0, src_ptr); +#ifdef REVERSE + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr); + } + else { + vstore4((VECTOR4)(r0.s2103), 0, dst_ptr); + } +#elif defined ORDER + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr); + } + else { + vstore4(r0, 0, dst_ptr); + } +#endif +#endif //INTEL_DEVICE } } ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -482,7 +972,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid } } -__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -507,7 +997,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -532,7 +1022,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi } } -__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -560,7 +1050,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2, #ifdef DEPTH_0 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset, __constant int * sdiv_table, __constant int * hdiv_table) @@ -600,7 +1090,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -656,7 +1146,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -698,7 +1188,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -758,7 +1248,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -805,7 +1295,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -860,7 +1350,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -907,7 +1397,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -968,33 +1458,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar * src, __global uchar * dst, - int src_offset, int dst_offset) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (y < rows && x < cols) - { - x <<= 2; - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x); - - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; - } -} - -__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -1003,14 +1470,129 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - uchar v3_half = v3 / 2; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; +#if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + + dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[3] = r0.s3; + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + + const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + + const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item + } +} + +__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, + __global const uchar * src, __global uchar * dst, + int src_offset, int dst_offset) +{ + int x = get_global_id(0) * pixels_per_work_item; + int y = get_global_id(1); + + if (y < rows && x < cols) + { + x <<= 2; + int src_idx = mad24(y, src_step, src_offset + x); + int dst_idx = mad24(y, dst_step, dst_offset + x); + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + const uchar v3_half = r0.s3 / 2; + + const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3; + const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3; + const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3; + + vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr); + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + const int2 v3_half = v3 / 2; + + const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + const int4 v3_half = v3 / 2; + + + const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int4 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item } }