mirror of
https://github.com/opencv/opencv.git
synced 2024-12-29 12:38:19 +08:00
1223 lines
64 KiB
Common Lisp
1223 lines
64 KiB
Common Lisp
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
|
//
|
|
// By downloading, copying, installing or using the software you agree to this license.
|
|
// If you do not agree to this license, do not download, install,
|
|
// copy or use the software.
|
|
//
|
|
//
|
|
// License Agreement
|
|
// For Open Source Computer Vision Library
|
|
//
|
|
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
|
// Third party copyrights are property of their respective owners.
|
|
//
|
|
// @Authors
|
|
// Jia Haipeng, jiahaipeng95@gmail.com
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without modification,
|
|
// are permitted provided that the following conditions are met:
|
|
//
|
|
// * Redistribution's of source code must retain the above copyright notice,
|
|
// this list of conditions and the following disclaimer.
|
|
//
|
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
|
// this list of conditions and the following disclaimer in the documentation
|
|
// and/or other oclMaterials provided with the distribution.
|
|
//
|
|
// * The name of the copyright holders may not be used to endorse or promote products
|
|
// derived from this software without specific prior written permission.
|
|
//
|
|
// This software is provided by the copyright holders and contributors as is and
|
|
// any express or implied warranties, including, but not limited to, the implied
|
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
|
// indirect, incidental, special, exemplary, or consequential damages
|
|
// (including, but not limited to, procurement of substitute goods or services;
|
|
// loss of use, data, or profits; or business interruption) however caused
|
|
// and on any theory of liability, whether in contract, strict liability,
|
|
// or tort (including negligence or otherwise) arising in any way out of
|
|
// the use of this software, even if advised of the possibility of such damage.
|
|
//
|
|
//M*/
|
|
#if defined (DOUBLE_SUPPORT)
|
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
|
#endif
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////optimized code using vector ////////////////////////////////
|
|
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
|
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
|
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
|
|
|
|
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
|
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
|
|
|
|
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
|
|
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
|
|
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
|
|
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
|
|
|
|
int total_bytes = src_offset + rows * src_step;
|
|
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
|
|
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
|
|
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
|
|
|
|
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
|
|
|
|
if((dst0_offset & 3) == 3)
|
|
tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x);
|
|
if((dst0_offset & 3) == 2)
|
|
tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x);
|
|
if((dst0_offset & 3) == 1)
|
|
tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x);
|
|
if((dst0_offset & 3) == 0)
|
|
tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x);
|
|
|
|
if((dst1_offset & 3) == 3)
|
|
tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y);
|
|
if((dst1_offset & 3) == 2)
|
|
tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y);
|
|
if((dst1_offset & 3) == 1)
|
|
tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y);
|
|
if((dst1_offset & 3) == 0)
|
|
tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y);
|
|
|
|
if((dst2_offset & 3) == 3)
|
|
tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z);
|
|
if((dst2_offset & 3) == 2)
|
|
tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z);
|
|
if((dst2_offset & 3) == 1)
|
|
tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z);
|
|
if((dst2_offset & 3) == 0)
|
|
tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z);
|
|
|
|
if((dst3_offset & 3) == 3)
|
|
tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w);
|
|
if((dst3_offset & 3) == 2)
|
|
tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w);
|
|
if((dst3_offset & 3) == 1)
|
|
tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w);
|
|
if((dst3_offset & 3) == 0)
|
|
tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w);
|
|
|
|
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
|
|
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
|
|
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
|
|
uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx));
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
|
|
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
|
|
|
|
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
|
|
tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
|
|
tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
|
|
tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
|
|
|
|
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
|
|
*((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
|
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
|
|
|
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
|
|
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
|
|
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
|
|
|
|
uchar4 tmp_data0, tmp_data1, tmp_data2;
|
|
|
|
uchar src_data_0 = *(mat_src + src_idx + 3 * x - 9);
|
|
uchar src_data_1 = *(mat_src + src_idx + 3 * x - 8);
|
|
uchar src_data_2 = *(mat_src + src_idx + 3 * x - 7);
|
|
|
|
uchar src_data_3 = *(mat_src + src_idx + 3 * x - 6);
|
|
uchar src_data_4 = *(mat_src + src_idx + 3 * x - 5);
|
|
uchar src_data_5 = *(mat_src + src_idx + 3 * x - 4);
|
|
|
|
uchar src_data_6 = *(mat_src + src_idx + 3 * x - 3);
|
|
uchar src_data_7 = *(mat_src + src_idx + 3 * x - 2);
|
|
uchar src_data_8 = *(mat_src + src_idx + 3 * x - 1);
|
|
|
|
uchar src_data_9 = *(mat_src + src_idx + 3 * x + 0);
|
|
uchar src_data_10 = *(mat_src + src_idx + 3 * x + 1);
|
|
uchar src_data_11 = *(mat_src + src_idx + 3 * x + 2);
|
|
|
|
uchar src_data_12 = *(mat_src + src_idx + 3 * x + 3);
|
|
uchar src_data_13 = *(mat_src + src_idx + 3 * x + 4);
|
|
uchar src_data_14 = *(mat_src + src_idx + 3 * x + 5);
|
|
|
|
uchar src_data_15 = *(mat_src + src_idx + 3 * x + 6);
|
|
uchar src_data_16 = *(mat_src + src_idx + 3 * x + 7);
|
|
uchar src_data_17 = *(mat_src + src_idx + 3 * x + 8);
|
|
|
|
uchar src_data_18 = *(mat_src + src_idx + 3 * x + 9);
|
|
uchar src_data_19 = *(mat_src + src_idx + 3 * x + 10);
|
|
uchar src_data_20 = *(mat_src + src_idx + 3 * x + 11);
|
|
|
|
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
|
|
int index = 3 - dst0_offset & 3;
|
|
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
|
|
|
uchar4 data0, data1, data2;
|
|
|
|
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
|
|
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
|
|
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (uchar4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
|
|
|
|
data0 = (uchar4)(src_data_2, src_data_5, src_data_8, src_data_11);
|
|
data1 = (dst2_offset & 3) == 2 ? (uchar4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
|
|
data2 = (dst2_offset & 3) == 1 ? (uchar4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (uchar4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
|
|
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
|
|
|
|
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
|
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
#define dst0_align ((dst0_offset & 3) << 1)
|
|
#define dst1_align ((dst1_offset & 3) << 1)
|
|
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
|
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
|
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
|
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
|
|
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
|
|
if(src_idx_0 == -6)
|
|
src_data_0.s01234567 = src_data_0.s67012345;
|
|
if(src_idx_0 == -4)
|
|
src_data_0.s01234567 = src_data_0.s45670123;
|
|
if(src_idx_0 == -2)
|
|
src_data_0.s01234567 = src_data_0.s23456701;
|
|
if(src_idx_1 == -6)
|
|
src_data_1.s01234567 = src_data_1.s67012345;
|
|
if(src_idx_1 == -4)
|
|
src_data_1.s01234567 = src_data_1.s45670123;
|
|
if(src_idx_1 == -2)
|
|
src_data_1.s01234567 = src_data_1.s23456701;
|
|
|
|
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
|
|
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
|
|
|
|
uchar4 tmp_data0, tmp_data1;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
|
|
|
|
*((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
|
|
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global char *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
|
|
|
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
|
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
|
|
|
|
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
|
|
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
|
|
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
|
|
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
|
|
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
|
|
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
|
|
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
|
|
|
|
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
|
|
|
|
if((dst0_offset & 3) == 3)
|
|
tmp_data0 = (char4)(data_0.x, data_1.x, data_2.x, data_3.x);
|
|
if((dst0_offset & 3) == 2)
|
|
tmp_data0 = (char4)(data_1.x, data_2.x, data_3.x, data_4.x);
|
|
if((dst0_offset & 3) == 1)
|
|
tmp_data0 = (char4)(data_2.x, data_3.x, data_4.x, data_5.x);
|
|
if((dst0_offset & 3) == 0)
|
|
tmp_data0 = (char4)(data_3.x, data_4.x, data_5.x, data_6.x);
|
|
|
|
if((dst1_offset & 3) == 3)
|
|
tmp_data1 = (char4)(data_0.y, data_1.y, data_2.y, data_3.y);
|
|
if((dst1_offset & 3) == 2)
|
|
tmp_data1 = (char4)(data_1.y, data_2.y, data_3.y, data_4.y);
|
|
if((dst1_offset & 3) == 1)
|
|
tmp_data1 = (char4)(data_2.y, data_3.y, data_4.y, data_5.y);
|
|
if((dst1_offset & 3) == 0)
|
|
tmp_data1 = (char4)(data_3.y, data_4.y, data_5.y, data_6.y);
|
|
|
|
if((dst2_offset & 3) == 3)
|
|
tmp_data2 = (char4)(data_0.z, data_1.z, data_2.z, data_3.z);
|
|
if((dst2_offset & 3) == 2)
|
|
tmp_data2 = (char4)(data_1.z, data_2.z, data_3.z, data_4.z);
|
|
if((dst2_offset & 3) == 1)
|
|
tmp_data2 = (char4)(data_2.z, data_3.z, data_4.z, data_5.z);
|
|
if((dst2_offset & 3) == 0)
|
|
tmp_data2 = (char4)(data_3.z, data_4.z, data_5.z, data_6.z);
|
|
|
|
if((dst3_offset & 3) == 3)
|
|
tmp_data3 = (char4)(data_0.w, data_1.w, data_2.w, data_3.w);
|
|
if((dst3_offset & 3) == 2)
|
|
tmp_data3 = (char4)(data_1.w, data_2.w, data_3.w, data_4.w);
|
|
if((dst3_offset & 3) == 1)
|
|
tmp_data3 = (char4)(data_2.w, data_3.w, data_4.w, data_5.w);
|
|
if((dst3_offset & 3) == 0)
|
|
tmp_data3 = (char4)(data_3.w, data_4.w, data_5.w, data_6.w);
|
|
|
|
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
|
|
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
|
|
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
|
|
char4 dst3_data = *((__global char4 *)(mat_dst3 + dst3_idx));
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
|
|
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
|
|
|
|
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
|
|
tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
|
|
tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
|
|
tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
|
|
|
|
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
|
|
*((__global char4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
|
|
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
|
|
|
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
|
|
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
|
|
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
|
|
|
|
char4 tmp_data0, tmp_data1, tmp_data2;
|
|
|
|
char src_data_0 = *(mat_src + src_idx + 3 * x - 9);
|
|
char src_data_1 = *(mat_src + src_idx + 3 * x - 8);
|
|
char src_data_2 = *(mat_src + src_idx + 3 * x - 7);
|
|
|
|
char src_data_3 = *(mat_src + src_idx + 3 * x - 6);
|
|
char src_data_4 = *(mat_src + src_idx + 3 * x - 5);
|
|
char src_data_5 = *(mat_src + src_idx + 3 * x - 4);
|
|
|
|
char src_data_6 = *(mat_src + src_idx + 3 * x - 3);
|
|
char src_data_7 = *(mat_src + src_idx + 3 * x - 2);
|
|
char src_data_8 = *(mat_src + src_idx + 3 * x - 1);
|
|
|
|
char src_data_9 = *(mat_src + src_idx + 3 * x + 0);
|
|
char src_data_10 = *(mat_src + src_idx + 3 * x + 1);
|
|
char src_data_11 = *(mat_src + src_idx + 3 * x + 2);
|
|
|
|
char src_data_12 = *(mat_src + src_idx + 3 * x + 3);
|
|
char src_data_13 = *(mat_src + src_idx + 3 * x + 4);
|
|
char src_data_14 = *(mat_src + src_idx + 3 * x + 5);
|
|
|
|
char src_data_15 = *(mat_src + src_idx + 3 * x + 6);
|
|
char src_data_16 = *(mat_src + src_idx + 3 * x + 7);
|
|
char src_data_17 = *(mat_src + src_idx + 3 * x + 8);
|
|
|
|
char src_data_18 = *(mat_src + src_idx + 3 * x + 9);
|
|
char src_data_19 = *(mat_src + src_idx + 3 * x + 10);
|
|
char src_data_20 = *(mat_src + src_idx + 3 * x + 11);
|
|
|
|
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
|
|
int index = 3 - dst0_offset & 3;
|
|
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
|
|
|
char4 data0, data1, data2;
|
|
|
|
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
|
|
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
|
|
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (char4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
|
|
|
|
data0 = (char4)(src_data_2, src_data_5, src_data_8, src_data_11);
|
|
data1 = (dst2_offset & 3) == 2 ? (char4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
|
|
data2 = (dst2_offset & 3) == 1 ? (char4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (char4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
|
|
tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
|
|
|
|
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
|
|
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 2;
|
|
|
|
#define dst0_align ((dst0_offset & 3) << 1)
|
|
#define dst1_align ((dst1_offset & 3) << 1)
|
|
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
|
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
|
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
|
|
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
|
|
if(src_idx_0 == -6)
|
|
src_data_0.s01234567 = src_data_0.s67012345;
|
|
if(src_idx_0 == -4)
|
|
src_data_0.s01234567 = src_data_0.s45670123;
|
|
if(src_idx_0 == -2)
|
|
src_data_0.s01234567 = src_data_0.s23456701;
|
|
if(src_idx_1 == -6)
|
|
src_data_1.s01234567 = src_data_1.s67012345;
|
|
if(src_idx_1 == -4)
|
|
src_data_1.s01234567 = src_data_1.s45670123;
|
|
if(src_idx_1 == -2)
|
|
src_data_1.s01234567 = src_data_1.s23456701;
|
|
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
|
|
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
|
|
|
|
char4 tmp_data0, tmp_data1;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
|
|
tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
|
|
tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
|
|
tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
|
|
tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
|
|
|
|
*((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
|
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
|
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
|
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
|
|
if(src_idx_0 == -6)
|
|
src_data0.s01234567 = src_data0.s67012345;
|
|
if(src_idx_0 == -4)
|
|
src_data0.s01234567 = src_data0.s45670123;
|
|
if(src_idx_0 == -2)
|
|
src_data0.s01234567 = src_data0.s23456701;
|
|
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
|
|
|
|
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
|
|
ushort2 dst3_data = *((__global ushort2 *)((__global char *)mat_dst3 + dst3_idx));
|
|
|
|
ushort2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
|
|
|
|
tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data0.s4, src_data1.s0) : (ushort2)(src_data0.s0, src_data0.s4);
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data0.s5, src_data1.s1) : (ushort2)(src_data0.s1, src_data0.s5);
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data0.s6, src_data1.s2) : (ushort2)(src_data0.s2, src_data0.s6);
|
|
tmp_data3 = (dst3_offset & 3) == 0 ? (ushort2)(src_data0.s7, src_data1.s3) : (ushort2)(src_data0.s3, src_data0.s7);
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
|
|
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
|
|
tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
|
|
|
|
*((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
|
|
*((global ushort2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
|
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
|
|
|
|
ushort2 tmp_data0, tmp_data1, tmp_data2;
|
|
|
|
ushort src_data_0 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 3];
|
|
ushort src_data_1 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 2];
|
|
ushort src_data_2 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 1];
|
|
ushort src_data_3 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 0];
|
|
ushort src_data_4 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 1];
|
|
ushort src_data_5 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 2];
|
|
ushort src_data_6 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 3];
|
|
ushort src_data_7 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 4];
|
|
ushort src_data_8 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 5];
|
|
|
|
tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data_3, src_data_6) : (ushort2)(src_data_0, src_data_3);
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data_4, src_data_7) : (ushort2)(src_data_1, src_data_4);
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data_5, src_data_8) : (ushort2)(src_data_2, src_data_5);
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
|
|
*((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
|
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
#define dst0_align ((dst0_offset & 3) << 1)
|
|
#define dst1_align ((dst1_offset & 3) << 1)
|
|
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
|
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
|
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
|
|
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
|
|
if(src_idx_0 < 0)
|
|
{
|
|
ushort4 tmp;
|
|
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
|
src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
|
}
|
|
if(src_idx_1 < 0)
|
|
{
|
|
ushort4 tmp;
|
|
tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
|
src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
|
|
}
|
|
|
|
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
|
|
ushort2 tmp_data0, tmp_data1;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
|
|
|
|
*((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
}
|
|
}
|
|
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
|
|
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global short *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
|
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
|
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
|
|
|
|
if(src_idx_0 == -6)
|
|
src_data0.s01234567 = src_data0.s67012345;
|
|
if(src_idx_0 == -4)
|
|
src_data0.s01234567 = src_data0.s45670123;
|
|
if(src_idx_0 == -2)
|
|
src_data0.s01234567 = src_data0.s23456701;
|
|
|
|
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
|
|
|
|
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
|
|
short2 dst3_data = *((__global short2 *)((__global char *)mat_dst3 + dst3_idx));
|
|
|
|
short2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
|
|
|
|
tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data0.s4, src_data1.s0) : (short2)(src_data0.s0, src_data0.s4);
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data0.s5, src_data1.s1) : (short2)(src_data0.s1, src_data0.s5);
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data0.s6, src_data1.s2) : (short2)(src_data0.s2, src_data0.s6);
|
|
tmp_data3 = (dst3_offset & 3) == 0 ? (short2)(src_data0.s7, src_data1.s3) : (short2)(src_data0.s3, src_data0.s7);
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
|
|
tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
|
|
tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
|
|
|
|
*((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
|
|
*((global short2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
|
|
}
|
|
}
|
|
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
|
|
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
|
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
|
|
|
|
short2 tmp_data0, tmp_data1, tmp_data2;
|
|
|
|
short src_data_0 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 3];
|
|
short src_data_1 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 2];
|
|
short src_data_2 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 1];
|
|
short src_data_3 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 0];
|
|
short src_data_4 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 1];
|
|
short src_data_5 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 2];
|
|
short src_data_6 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 3];
|
|
short src_data_7 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 4];
|
|
short src_data_8 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 5];
|
|
|
|
tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data_3, src_data_6) : (short2)(src_data_0, src_data_3);
|
|
tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data_4, src_data_7) : (short2)(src_data_1, src_data_4);
|
|
tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data_5, src_data_8) : (short2)(src_data_2, src_data_5);
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
|
|
|
|
tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
|
|
tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
|
|
|
|
*((__global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((__global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
*((__global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
|
|
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
x = x << 1;
|
|
|
|
#define dst0_align ((dst0_offset & 3) << 1)
|
|
#define dst1_align ((dst1_offset & 3) << 1)
|
|
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
|
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
|
|
|
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
|
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
|
|
|
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
|
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
|
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
|
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
|
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
|
|
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
|
|
if(src_idx_0 < 0)
|
|
{
|
|
short4 tmp;
|
|
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
|
src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
|
}
|
|
if(src_idx_1< 0)
|
|
{
|
|
short4 tmp;
|
|
tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
|
src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
|
|
}
|
|
|
|
|
|
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
|
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
|
|
|
|
short2 tmp_data0, tmp_data1;
|
|
|
|
tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
|
|
tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
|
|
|
|
tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
|
|
tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
|
|
|
|
*((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
|
|
*((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
|
|
}
|
|
}
|
|
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
|
|
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global int *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
|
|
|
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
|
|
((__global int *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
|
|
}
|
|
}
|
|
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
|
|
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
|
|
int src_data_0 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 0];
|
|
int src_data_1 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 1];
|
|
int src_data_2 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 2];
|
|
|
|
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
|
|
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
|
|
((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
|
|
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
|
|
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
|
|
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global float *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
|
|
|
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
|
|
((__global float *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
|
|
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
|
|
float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0];
|
|
float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1];
|
|
float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2];
|
|
|
|
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
|
|
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
|
|
((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
|
|
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
|
|
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
}
|
|
}
|
|
|
|
#if defined (DOUBLE_SUPPORT)
|
|
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
|
|
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
|
__global double *mat_dst3, int dst3_step, int dst3_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
|
|
|
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
|
|
((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
|
|
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
|
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
|
|
|
double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0];
|
|
double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1];
|
|
double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2];
|
|
|
|
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
|
|
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
|
|
((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
|
|
}
|
|
}
|
|
|
|
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
|
|
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
|
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
|
int rows, int cols, int dst_step1)
|
|
|
|
{
|
|
int x = get_global_id(0);
|
|
int y = get_global_id(1);
|
|
|
|
if((x < cols) && (y < rows))
|
|
{
|
|
int src_idx = mad24(y, src_step, src_offset);
|
|
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
|
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
|
|
|
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
|
|
|
|
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
|
((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
|
|
}
|
|
}
|
|
#endif
|