mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
Allow OpenCL acceleration in every OpenCV module
This commit is contained in:
parent
d28df08eb0
commit
91ac9688a8
@ -782,7 +782,7 @@ if(HAVE_CUDA)
|
||||
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
|
||||
endif()
|
||||
|
||||
if(HAVE_OPENCL AND BUILD_opencv_ocl)
|
||||
if(HAVE_OPENCL)
|
||||
status("")
|
||||
status(" OpenCL")
|
||||
if(OPENCL_INCLUDE_DIR)
|
||||
|
@ -432,10 +432,22 @@ macro(ocv_glob_module_sources)
|
||||
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
|
||||
|
||||
file(GLOB cl_kernels "src/opencl/*.cl")
|
||||
|
||||
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
|
||||
source_group("Include" FILES ${lib_hdrs})
|
||||
source_group("Include\\detail" FILES ${lib_hdrs_detail})
|
||||
|
||||
if(HAVE_OPENCL AND cl_kernels)
|
||||
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
|
||||
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
|
||||
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
|
||||
source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
endif()
|
||||
|
||||
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
|
||||
endmacro()
|
||||
|
||||
@ -449,6 +461,9 @@ macro(ocv_create_module)
|
||||
|
||||
if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
|
||||
target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
|
||||
if(HAVE_OPENCL AND OPENCL_LIBRARIES)
|
||||
target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_dependencies(opencv_modules ${the_module})
|
||||
|
@ -3,45 +3,5 @@ if(NOT HAVE_OPENCL)
|
||||
endif()
|
||||
|
||||
set(the_description "OpenCL-accelerated Computer Vision")
|
||||
ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
|
||||
ocv_module_include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
|
||||
file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
|
||||
set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${kernels_cpp}
|
||||
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
|
||||
DEPENDS ${CL_FILES} ${cl2cpp_script})
|
||||
|
||||
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
file(GLOB lib_srcs "src/*.cpp")
|
||||
file(GLOB lib_int_hdrs "src/*.h*")
|
||||
|
||||
source_group("Include" FILES ${lib_hdrs})
|
||||
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
|
||||
|
||||
ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
|
||||
|
||||
ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
|
||||
ocv_create_module(${OPENCL_LIBRARIES})
|
||||
ocv_add_precompiled_headers(${the_module})
|
||||
|
||||
################################################################################################################
|
||||
################################ OpenCL Module Tests ##################################################
|
||||
################################################################################################################
|
||||
file(GLOB test_srcs "test/*.cpp")
|
||||
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
|
||||
|
||||
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
|
||||
FILES "Src" ${test_srcs})
|
||||
|
||||
################################################################################################################
|
||||
################################ OpenCL Module Performance ##################################################
|
||||
################################################################################################################
|
||||
file(GLOB perf_srcs "perf/*.cpp")
|
||||
file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
|
||||
|
||||
ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
|
||||
FILES "Src" ${perf_srcs})
|
||||
|
@ -1,865 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 1e7f
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
int c = 0;
|
||||
int ix = (int)x;
|
||||
|
||||
for (int i = 0 ; i < 32 ; i++)
|
||||
{
|
||||
c += ix & 0x1;
|
||||
ix >>= 1;
|
||||
}
|
||||
|
||||
return (float)c;
|
||||
}
|
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
||||
local size: dim0 is block_size, dim1 is block_size.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
__local float *s_distance = (__local float *)(sharebuffer);
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
//Dist dist;
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//myBestImgidx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
const int loadX = lidx + i * block_size;
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
local float *s_distance = (local float *)sharebuffer;
|
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0.0f;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k,
|
||||
int block_size
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data ,src2_data;
|
||||
uchar4 src1_data ,src2_data;
|
||||
|
||||
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
|
||||
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
|
||||
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
|
||||
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
|
||||
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
|
||||
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
|
||||
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
|
||||
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
|
||||
|
||||
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
|
||||
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
|
||||
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
|
||||
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
|
||||
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
|
||||
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
|
||||
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
|
||||
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
|
||||
@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
|
||||
|
||||
#define dst_align ((dst_offset >> bitOfInt) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
|
||||
@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = ~ src1_data;
|
||||
|
||||
|
||||
/* if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
|
||||
{
|
||||
int src_index = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
|
||||
char8 data;
|
||||
|
||||
data = *((__global char8 *)((__global char *)src + src_index));
|
||||
data = ~ data;
|
||||
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,31 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
|
||||
@ -115,29 +115,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -166,32 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
|
||||
@ -215,32 +215,32 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -266,22 +266,22 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -308,29 +308,29 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -359,31 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -410,31 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -463,29 +463,29 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -512,31 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -561,29 +561,29 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
@ -610,29 +610,29 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
@ -661,30 +661,30 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -715,30 +715,30 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@ -770,30 +770,30 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -821,30 +821,30 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
@ -870,30 +870,30 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
if(src1_index < 0)
|
||||
{
|
||||
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
@ -921,28 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@ -954,4 +954,3 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -59,29 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -111,29 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -163,29 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -211,30 +211,30 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -260,28 +260,28 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
|
||||
@ -307,29 +307,29 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -344,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/***********************************Compare LT*******************************/
|
||||
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
@ -359,29 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -411,30 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -464,29 +464,29 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -513,34 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
|
||||
@ -565,29 +565,29 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -614,29 +614,29 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -665,29 +665,29 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -718,29 +718,29 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -771,29 +771,29 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -820,29 +820,29 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
|
||||
@ -868,28 +868,28 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
|
||||
@ -916,29 +916,29 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
|
||||
@ -952,5 +952,3 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
|
||||
src1_data.s01234567 = src1_data.s45670123;
|
||||
if(src1_index== -2)
|
||||
src1_data.s01234567 = src1_data.s23456701;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||
|
@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
|
||||
}
|
||||
}
|
||||
|
@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
}
|
||||
|
@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
|
||||
dst[gid] = localmem_sum[0];
|
||||
}
|
||||
}
|
||||
|
@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
|
||||
dst[gid*3+2] = localmem_sum3[0];
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,7 @@
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Liu Liujun, liujun@multicorewareinc.com
|
||||
// Liu Liujun, liujun@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0(
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep >> 2,idx);
|
||||
float4 w1 = weight1[wpos], w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0(
|
||||
int wpos = mad24(idy,wstep, idx);
|
||||
float w1 = weight1[wpos];
|
||||
float w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5(
|
||||
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
|
||||
}
|
||||
}
|
||||
|
865
modules/ocl/src/opencl/brute_force_match.cl
Normal file
865
modules/ocl/src/opencl/brute_force_match.cl
Normal file
@ -0,0 +1,865 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 1e7f
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
int c = 0;
|
||||
int ix = (int)x;
|
||||
|
||||
for (int i = 0 ; i < 32 ; i++)
|
||||
{
|
||||
c += ix & 0x1;
|
||||
ix >>= 1;
|
||||
}
|
||||
|
||||
return (float)c;
|
||||
}
|
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
||||
local size: dim0 is block_size, dim1 is block_size.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
__local float *s_distance = (__local float *)(sharebuffer);
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
//Dist dist;
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//myBestImgidx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
const int loadX = lidx + i * block_size;
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
local float *s_distance = (local float *)sharebuffer;
|
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0.0f;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k,
|
||||
int block_size
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
@ -234,4 +234,3 @@ __kernel
|
||||
map_y[y * step_y + x] = ycoo;
|
||||
}
|
||||
}
|
||||
|
@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
dst[start_addr] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -559,7 +559,3 @@ if(result)
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
}
|
||||
|
@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
|
||||
dst[gy*(dst_step >> 2)+gx] = res;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
@ -484,4 +484,3 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
|
||||
}
|
||||
#undef op(a,b)
|
||||
|
@ -48,7 +48,7 @@
|
||||
#if defined DOUBLE_SUPPORT
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
typedef double4 F4 ;
|
||||
#else
|
||||
#else
|
||||
typedef float4 F4;
|
||||
#endif
|
||||
|
||||
@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 2;
|
||||
@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
|
||||
|
||||
|
||||
uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0));
|
||||
uchar4 src_data = val;
|
||||
|
||||
@ -91,12 +91,12 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
src_data.s2 = *(src + srcIdx.s2);
|
||||
if (con.s3 == 0)
|
||||
src_data.s3 = *(src + srcIdx.s3);
|
||||
|
||||
|
||||
uchar4 dst_data;
|
||||
|
||||
|
||||
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
|
||||
|
||||
uchar4 dVal = *d;
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
|
||||
@ -113,7 +113,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 2;
|
||||
@ -131,9 +131,9 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
|
||||
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
|
||||
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
|
||||
|
||||
|
||||
uchar4 src_data = val;
|
||||
uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
|
||||
uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
|
||||
|
||||
if (con.s0 == 0)
|
||||
src_data.s0 = *(src + srcIdx.s0);
|
||||
@ -147,10 +147,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
// dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
|
||||
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
|
||||
|
||||
uchar4 dVal = *d;
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
|
||||
|
||||
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
|
||||
*d = dst_data;
|
||||
}
|
||||
@ -162,7 +162,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 2;
|
||||
@ -183,9 +183,9 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
|
||||
float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
|
||||
int8 map_dataZ = convert_int8_sat_rte(map_data);
|
||||
int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
|
||||
|
||||
|
||||
uchar4 src_data = val;
|
||||
uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
|
||||
uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
|
||||
|
||||
if (con.s0 == 0)
|
||||
src_data.s0 = *(src + srcIdx.s0);
|
||||
@ -196,14 +196,14 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
|
||||
if (con.s3 == 0)
|
||||
src_data.s3 = *(src + srcIdx.s3);
|
||||
uchar4 dst_data;
|
||||
|
||||
|
||||
// dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
|
||||
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
|
||||
|
||||
uchar4 dVal = *d;
|
||||
uchar4 dVal = *d;
|
||||
|
||||
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
|
||||
|
||||
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
|
||||
*d = dst_data;
|
||||
}
|
||||
@ -272,7 +272,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
|
||||
int y = get_global_id(1);
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
{
|
||||
int dstIdx = y * dst_step + (x << 2) + dst_offset;
|
||||
int mapIdx = y * map1_step + (x << 2) + map1_offset;
|
||||
float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
|
||||
@ -294,7 +294,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
int dstIdx = y * dst_step + (x << 2) + dst_offset;
|
||||
@ -309,7 +309,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
src_data = *((__global float *)((__global uchar *)src + srcIdx));
|
||||
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
int dstIdx = y * dst_step + (x << 2) + dst_offset;
|
||||
@ -337,7 +337,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
src_data = *((__global float *)((__global uchar *)src + srcIdx));
|
||||
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -348,7 +348,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
int dstIdx = y * dst_step + (x << 2) + dst_offset;
|
||||
@ -367,7 +367,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
|
||||
src_data = *((__global float *)((__global uchar *)src + srcIdx));
|
||||
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -391,9 +391,9 @@ __kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const
|
||||
src_data = nval;
|
||||
else
|
||||
src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
|
||||
*((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
|
||||
*((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -454,13 +454,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
int y = get_global_id(1);
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 2;
|
||||
x = x << 2;
|
||||
int gx = x - (dst_offset&3);
|
||||
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
|
||||
|
||||
uchar4 nval =convert_uchar4(nVal);
|
||||
uchar4 val = (uchar4)(nval.s0);
|
||||
|
||||
|
||||
|
||||
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
|
||||
|
||||
@ -518,12 +518,12 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
|
||||
d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
|
||||
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
|
||||
d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
|
||||
|
||||
|
||||
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
|
||||
|
||||
|
||||
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
|
||||
|
||||
uchar4 dVal = *D;
|
||||
uchar4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
|
||||
|
||||
@ -540,13 +540,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
|
||||
int y = get_global_id(1);
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 2;
|
||||
x = x << 2;
|
||||
int gx = x - (dst_offset&3);
|
||||
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
|
||||
|
||||
uchar4 nval =convert_uchar4(nVal);
|
||||
uchar4 val = (uchar4)(nval.s0);
|
||||
|
||||
|
||||
|
||||
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
|
||||
|
||||
@ -607,13 +607,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
|
||||
d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
|
||||
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
|
||||
d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
|
||||
|
||||
|
||||
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
|
||||
|
||||
uchar4 dVal = *D;
|
||||
uchar4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
|
||||
|
||||
@ -725,13 +725,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
int y = get_global_id(1);
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 4;
|
||||
x = x << 4;
|
||||
int gx = x - (dst_offset&15);
|
||||
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
|
||||
|
||||
float4 nval =convert_float4(nVal);
|
||||
float4 val = (float4)(nval.s0);
|
||||
|
||||
|
||||
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
|
||||
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
|
||||
float8 map1_data;
|
||||
@ -787,12 +787,12 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
|
||||
d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
|
||||
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
|
||||
d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
|
||||
|
||||
|
||||
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
|
||||
|
||||
|
||||
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
|
||||
|
||||
float4 dVal = *D;
|
||||
float4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
@ -809,13 +809,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
|
||||
int y = get_global_id(1);
|
||||
if(x < threadCols && y < dst_rows)
|
||||
{
|
||||
x = x << 4;
|
||||
x = x << 4;
|
||||
int gx = x - (dst_offset&15);
|
||||
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
|
||||
|
||||
float4 nval =convert_float4(nVal);
|
||||
float4 val = (float4)(nval.s0);
|
||||
|
||||
|
||||
int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
|
||||
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
|
||||
float4 map1_data;
|
||||
@ -874,13 +874,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
|
||||
d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
|
||||
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
|
||||
d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
|
||||
|
||||
|
||||
|
||||
|
||||
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
|
||||
|
||||
|
||||
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
|
||||
|
||||
float4 dVal = *D;
|
||||
float4 dVal = *D;
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
|
||||
@ -928,7 +928,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const
|
||||
else
|
||||
d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
|
||||
|
||||
float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
|
||||
float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
|
||||
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
|
||||
|
||||
}
|
||||
@ -974,12 +974,9 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const
|
||||
else
|
||||
d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
|
||||
|
||||
float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
|
||||
float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
|
||||
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -411,4 +411,3 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
|
||||
dst[dpos] = src[spos];
|
||||
|
||||
}
|
||||
|
@ -150,4 +150,3 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -682,4 +682,3 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -821,4 +821,3 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
|
||||
res[res_idx] = normAcc(num, denum);
|
||||
}
|
||||
}
|
||||
|
@ -240,4 +240,3 @@ __kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
|
||||
// outsp[basesp] =(short2)((short)x0,(short)y0);
|
||||
}
|
||||
}
|
||||
|
@ -27,7 +27,7 @@ typedef long T;
|
||||
#define DST_ROW_A03 9
|
||||
|
||||
__kernel void icvContourMoments(int contour_total,
|
||||
__global float* reader_oclmat_data,
|
||||
__global float* reader_oclmat_data,
|
||||
__global T* dst_a,
|
||||
int dst_step)
|
||||
{
|
||||
@ -58,7 +58,7 @@ __kernel void icvContourMoments(int contour_total,
|
||||
dxy = xi_1 * yi - xi * yi_1;
|
||||
xii_1 = xi_1 + xi;
|
||||
yii_1 = yi_1 + yi;
|
||||
|
||||
|
||||
dst_step /= sizeof(T);
|
||||
*( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
|
||||
*( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
|
@ -104,11 +104,11 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
|
||||
// N = 2
|
||||
// for simple haar paatern
|
||||
float icvCalcHaarPatternSum_2(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -137,11 +137,11 @@ float icvCalcHaarPatternSum_2(
|
||||
|
||||
// N = 3
|
||||
float icvCalcHaarPatternSum_3(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -170,11 +170,11 @@ float icvCalcHaarPatternSum_3(
|
||||
|
||||
// N = 4
|
||||
float icvCalcHaarPatternSum_4(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -265,7 +265,7 @@ __kernel void icvCalcLayerDetAndTrace(
|
||||
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
|
||||
|
||||
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
}
|
||||
}
|
||||
|
||||
@ -301,9 +301,9 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
|
||||
// Non-maximal suppression to further filtering the candidates from previous step
|
||||
__kernel
|
||||
void icvFindMaximaInLayer_withmask(
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -345,26 +345,26 @@ __kernel
|
||||
// Is this thread within the hessian buffer?
|
||||
const int zoff = get_local_size(0) * get_local_size(1);
|
||||
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -429,9 +429,9 @@ __kernel
|
||||
|
||||
__kernel
|
||||
void icvFindMaximaInLayer(
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -474,19 +474,19 @@ __kernel
|
||||
int l_x = min(max(j, 0), c_img_cols - 1);
|
||||
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
|
||||
|
||||
N9[localLin - zoff] =
|
||||
N9[localLin - zoff] =
|
||||
det[det_step * (l_y - c_layer_rows) + l_x];
|
||||
N9[localLin ] =
|
||||
N9[localLin ] =
|
||||
det[det_step * (l_y ) + l_x];
|
||||
N9[localLin + zoff] =
|
||||
N9[localLin + zoff] =
|
||||
det[det_step * (l_y + c_layer_rows) + l_x];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -554,17 +554,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
{
|
||||
F invdet = 1.0 / det;
|
||||
|
||||
x[0] = invdet *
|
||||
x[0] = invdet *
|
||||
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
|
||||
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
|
||||
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
|
||||
|
||||
x[1] = invdet *
|
||||
x[1] = invdet *
|
||||
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
|
||||
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
|
||||
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
|
||||
|
||||
x[2] = invdet *
|
||||
x[2] = invdet *
|
||||
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
|
||||
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
|
||||
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
|
||||
@ -585,9 +585,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// INTERPOLATION
|
||||
__kernel
|
||||
__kernel
|
||||
void icvInterpolateKeypoint(
|
||||
__global const float * det,
|
||||
__global const float * det,
|
||||
__global const int4 * maxPosBuffer,
|
||||
__global float * keypoints,
|
||||
volatile __global int * featureCounter,
|
||||
@ -617,7 +617,7 @@ __kernel
|
||||
|
||||
volatile __local float N9[3][3][3];
|
||||
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
det[det_step * (c_layer_rows * layer + i) + j];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@ -715,27 +715,27 @@ __kernel
|
||||
|
||||
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
|
||||
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
|
||||
0.001707611023448408f, 0.001455130288377404f};
|
||||
|
||||
@ -748,13 +748,13 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
|
||||
data[tid] = *partial_reduction;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 16)
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
#undef op
|
||||
}
|
||||
@ -958,8 +958,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
|
||||
|
||||
// utility for linear filter
|
||||
inline uchar readerGet(
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
int i, int j, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
@ -969,8 +969,8 @@ inline uchar readerGet(
|
||||
}
|
||||
|
||||
inline float linearFilter(
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
float y, float x, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
@ -1004,9 +1004,9 @@ void calc_dx_dy(
|
||||
volatile __local float s_dx_bin[25],
|
||||
volatile __local float s_dy_bin[25],
|
||||
volatile __local float s_PATCH[6][6],
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureDir,
|
||||
int rows,
|
||||
int cols,
|
||||
@ -1058,26 +1058,26 @@ void calc_dx_dy(
|
||||
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
|
||||
|
||||
const float vx = (
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
* dw;
|
||||
const float vy = (
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
* dw;
|
||||
s_dx_bin[tid] = vx;
|
||||
s_dy_bin[tid] = vy;
|
||||
}
|
||||
}
|
||||
void reduce_sum25(
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
int tid
|
||||
)
|
||||
{
|
||||
@ -1115,13 +1115,13 @@ void reduce_sum25(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors64(
|
||||
IMAGE_INT8 imgTex,
|
||||
volatile __global float * descriptors,
|
||||
volatile __global float * descriptors,
|
||||
__global const float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step,
|
||||
int keypoints_step,
|
||||
int rows,
|
||||
int cols,
|
||||
int img_step
|
||||
@ -1155,7 +1155,7 @@ __kernel
|
||||
if (tid < 25)
|
||||
{
|
||||
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 25)
|
||||
{
|
||||
@ -1171,10 +1171,10 @@ __kernel
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors128(
|
||||
IMAGE_INT8 imgTex,
|
||||
__global volatile float * descriptors,
|
||||
__global volatile float * descriptors,
|
||||
__global float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step,
|
||||
@ -1269,7 +1269,7 @@ __kernel
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
||||
@ -1310,7 +1310,7 @@ __kernel
|
||||
// normalize and store in output
|
||||
descriptor_base[get_local_id(0)] = lookup / len;
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
@ -57,4 +57,3 @@ __kernel void set_to_with_mask(
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -51,9 +51,9 @@
|
||||
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
||||
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
|
||||
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
|
||||
|
||||
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
|
||||
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
|
||||
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
|
||||
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
|
||||
|
||||
int total_bytes = src_offset + rows * src_step;
|
||||
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
|
||||
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
|
||||
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
|
||||
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
|
||||
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
|
||||
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
|
||||
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
|
||||
|
||||
int total_bytes = src_offset + rows * src_step;
|
||||
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
|
||||
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
|
||||
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
|
||||
|
||||
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
|
||||
|
||||
@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
|
||||
}
|
||||
|
||||
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
||||
|
||||
|
||||
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
|
||||
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
|
||||
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
|
||||
@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
|
||||
|
||||
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
|
||||
int index = 3 - dst0_offset & 3;
|
||||
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
||||
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
||||
|
||||
uchar4 data0, data1, data2;
|
||||
|
||||
|
||||
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
|
||||
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
|
||||
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
|
||||
@ -263,33 +263,33 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst0_align ((dst0_offset & 3) << 1)
|
||||
#define dst1_align ((dst1_offset & 3) << 1)
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
|
||||
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
|
||||
if(src_idx_0 == -6)
|
||||
@ -326,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s
|
||||
}
|
||||
|
||||
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global char *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -336,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
||||
int src_idx = mad24(y, src_step, src_offset + (x << 2));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
|
||||
|
||||
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
|
||||
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
|
||||
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
|
||||
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
|
||||
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
|
||||
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
|
||||
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
|
||||
|
||||
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
|
||||
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
|
||||
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
|
||||
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
|
||||
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
|
||||
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
|
||||
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
|
||||
|
||||
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
|
||||
|
||||
@ -437,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
|
||||
}
|
||||
|
||||
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
|
||||
|
||||
|
||||
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
|
||||
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
|
||||
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
|
||||
@ -500,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
|
||||
|
||||
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
|
||||
int index = 3 - dst0_offset & 3;
|
||||
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
||||
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
|
||||
|
||||
char4 data0, data1, data2;
|
||||
|
||||
|
||||
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
|
||||
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
|
||||
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
|
||||
@ -536,32 +536,32 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst0_align ((dst0_offset & 3) << 1)
|
||||
#define dst1_align ((dst1_offset & 3) << 1)
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
|
||||
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
|
||||
if(src_idx_0 == -6)
|
||||
@ -597,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr
|
||||
}
|
||||
|
||||
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -607,30 +607,30 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
||||
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
||||
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
||||
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
|
||||
if(src_idx_0 == -6)
|
||||
src_data0.s01234567 = src_data0.s67012345;
|
||||
@ -672,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
|
||||
}
|
||||
|
||||
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
|
||||
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
|
||||
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
|
||||
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
|
||||
@ -735,48 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst0_align ((dst0_offset & 3) << 1)
|
||||
#define dst1_align ((dst1_offset & 3) << 1)
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
|
||||
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
|
||||
if(src_idx_0 < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
||||
src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src_idx_1 < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
||||
src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src_idx_0 < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
||||
src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src_idx_1 < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
||||
src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
|
||||
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
|
||||
|
||||
@ -793,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int
|
||||
}
|
||||
}
|
||||
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global short *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -803,38 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
||||
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
||||
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
|
||||
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_start = mad24(y, dst3_step, dst3_offset);
|
||||
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
|
||||
|
||||
|
||||
if(src_idx_0 == -6)
|
||||
src_data0.s01234567 = src_data0.s67012345;
|
||||
if(src_idx_0 == -4)
|
||||
src_data0.s01234567 = src_data0.s45670123;
|
||||
if(src_idx_0 == -2)
|
||||
src_data0.s01234567 = src_data0.s23456701;
|
||||
|
||||
|
||||
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
|
||||
|
||||
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
||||
@ -868,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
|
||||
}
|
||||
}
|
||||
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global short *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_start = mad24(y, dst2_step, dst2_offset);
|
||||
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
|
||||
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
||||
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
|
||||
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
|
||||
@ -932,47 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s
|
||||
|
||||
|
||||
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global short *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global short *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst0_align ((dst0_offset & 3) << 1)
|
||||
#define dst1_align ((dst1_offset & 3) << 1)
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
|
||||
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
|
||||
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
|
||||
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
|
||||
if(src_idx_0 < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
||||
src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src_idx_1< 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
||||
src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src_idx_0 < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
|
||||
src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src_idx_1< 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
|
||||
src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
|
||||
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
|
||||
@ -990,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s
|
||||
}
|
||||
}
|
||||
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global int *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -1000,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
||||
|
||||
|
||||
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
||||
@ -1017,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
|
||||
}
|
||||
}
|
||||
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global int *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
@ -1044,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global int *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global int *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
|
||||
|
||||
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
||||
@ -1066,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src
|
||||
}
|
||||
|
||||
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global float *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -1076,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
||||
|
||||
|
||||
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
||||
@ -1094,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
|
||||
}
|
||||
|
||||
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global float *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
@ -1121,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global float *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global float *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
|
||||
|
||||
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
||||
@ -1144,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global double *mat_dst3, int dst3_step, int dst3_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
@ -1154,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
int dst3_idx = mad24(y, dst3_step, dst3_offset);
|
||||
|
||||
|
||||
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
||||
@ -1172,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
|
||||
}
|
||||
|
||||
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global double *mat_dst2, int dst2_step, int dst2_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
int dst2_idx = mad24(y, dst2_step, dst2_offset);
|
||||
@ -1199,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global double *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global double *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int src_idx = mad24(y, src_step, src_offset);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset);
|
||||
|
||||
|
||||
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
|
||||
|
||||
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
|
@ -55,9 +55,9 @@ int SQ(int a)
|
||||
return a * a;
|
||||
}
|
||||
|
||||
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
{
|
||||
{
|
||||
unsigned int cache = 0;
|
||||
unsigned int cache2 = 0;
|
||||
|
||||
@ -77,7 +77,7 @@ unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
return col_ssd[0] + cache + cache2;
|
||||
}
|
||||
|
||||
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
{
|
||||
unsigned int ssd[N_DISPARITIES];
|
||||
@ -112,7 +112,7 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
return (uint2)(mssd, bestIdx);
|
||||
}
|
||||
|
||||
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
@ -179,8 +179,8 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
}
|
||||
|
||||
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d,
|
||||
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
@ -215,15 +215,15 @@ void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imag
|
||||
col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
|
||||
}
|
||||
|
||||
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
|
||||
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
|
||||
__global unsigned int *cminSSDImage, int cminSSD_step,
|
||||
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
|
||||
int img_step, int maxdisp, int radius,
|
||||
int img_step, int maxdisp, int radius,
|
||||
__local unsigned int *col_ssd_cache)
|
||||
{
|
||||
|
||||
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
|
||||
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
|
||||
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
|
||||
|
||||
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
|
||||
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
|
||||
@ -266,8 +266,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
int idx1 = y_tex * img_step + x_tex;
|
||||
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
|
||||
if (col_ssd_extra > 0)
|
||||
@ -276,7 +276,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
|
||||
y_tex += 1;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (X < cwidth - radius && row < cheight - radius - Y)
|
||||
{
|
||||
@ -296,7 +296,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
//////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
|
||||
__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
|
||||
int rows, int cols, int prefilterCap)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@ -304,7 +304,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned
|
||||
|
||||
if(x < cols && y < rows)
|
||||
{
|
||||
int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
|
||||
int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
|
||||
input[(y) * cols + (x-1)] * (-2) + input[(y) * cols + (x+1)] * (2) +
|
||||
input[(y+1) * cols + (x-1)] * (-1) + input[(y+1) * cols + (x+1)] * (1);
|
||||
|
||||
@ -325,10 +325,10 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
|
||||
int x1 = x==0? 0 : x-1;
|
||||
if(x < cols && y < rows)
|
||||
{
|
||||
conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) +
|
||||
conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) +
|
||||
(float)input[(y) * cols + (x1)] * (-2) + (float)input[(y) * cols + (x+1)] * (2) +
|
||||
(float)input[(y+1) * cols + (x1)] * (-1) + (float)input[(y+1) * cols + (x+1)] * (1);
|
||||
|
||||
|
||||
}
|
||||
return fabs(conv);
|
||||
}
|
||||
@ -359,9 +359,9 @@ float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
|
||||
}
|
||||
|
||||
#define RpT (2 * ROWSperTHREAD) // got experimentally
|
||||
__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
|
||||
int disp_step, __global unsigned char *input, int input_rows,
|
||||
int input_cols,int winsz, float threshold,
|
||||
__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
|
||||
int disp_step, __global unsigned char *input, int input_rows,
|
||||
int input_cols,int winsz, float threshold,
|
||||
__local float *cols_cache)
|
||||
{
|
||||
int winsz2 = winsz/2;
|
||||
@ -405,13 +405,13 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in
|
||||
|
||||
for(int y = beg_row + 1; y < end_row; ++y)
|
||||
{
|
||||
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
|
||||
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
|
||||
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
|
||||
*cols = sum;
|
||||
|
||||
if (cols_extra)
|
||||
{
|
||||
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
|
||||
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
|
||||
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
|
||||
*cols_extra = sum_extra;
|
||||
}
|
Loading…
Reference in New Issue
Block a user