Merge branch '2.4'

This commit is contained in:
Andrey Kamaev 2013-04-12 15:35:38 +04:00
commit 3b364330ad
75 changed files with 1259 additions and 3979 deletions

View File

@ -4,23 +4,43 @@ import android.os.IBinder;
public class BinderConnector public class BinderConnector
{ {
public BinderConnector(MarketConnector Market) public BinderConnector(MarketConnector Market) {
{ mMarket = Market;
Init(Market);
}
public native IBinder Connect();
public boolean Disconnect()
{
Final();
return true;
} }
static public boolean Init() {
boolean result = false;
if (mIsReady)
result = Init(mMarket);
return result;
}
public native IBinder Connect();
public boolean Disconnect()
{ {
System.loadLibrary("OpenCVEngine"); if (mIsReady)
System.loadLibrary("OpenCVEngine_jni"); Final();
return mIsReady;
} }
private native boolean Init(MarketConnector Market); private native boolean Init(MarketConnector Market);
public native void Final(); private native void Final();
private static boolean mIsReady = false;
private MarketConnector mMarket;
static {
try {
System.loadLibrary("OpenCVEngine");
System.loadLibrary("OpenCVEngine_jni");
mIsReady = true;
}
catch(UnsatisfiedLinkError e) {
mIsReady = false;
e.printStackTrace();
}
}
} }

View File

@ -47,9 +47,17 @@ public class HardwareDetector
public static native int DetectKnownPlatforms(); public static native int DetectKnownPlatforms();
static public static boolean mIsReady = false;
{
System.loadLibrary("OpenCVEngine"); static {
System.loadLibrary("OpenCVEngine_jni"); try {
System.loadLibrary("OpenCVEngine");
System.loadLibrary("OpenCVEngine_jni");
mIsReady = true;
}
catch(UnsatisfiedLinkError e) {
mIsReady = false;
e.printStackTrace();
}
} }
} }

View File

@ -3,31 +3,62 @@ package org.opencv.engine;
import android.app.Service; import android.app.Service;
import android.content.Intent; import android.content.Intent;
import android.os.IBinder; import android.os.IBinder;
import android.os.RemoteException;
import android.util.Log; import android.util.Log;
public class OpenCVEngineService extends Service public class OpenCVEngineService extends Service
{ {
private static final String TAG = "OpenCVEngine/Service"; private static final String TAG = "OpenCVEngine/Service";
private IBinder mEngineInterface; private IBinder mEngineInterface = null;
private MarketConnector mMarket; private MarketConnector mMarket;
private BinderConnector mNativeBinder; private BinderConnector mNativeBinder;
public void onCreate()
{ public void onCreate() {
Log.i(TAG, "Service starting"); Log.i(TAG, "Service starting");
super.onCreate(); super.onCreate();
Log.i(TAG, "Engine binder component creating"); Log.i(TAG, "Engine binder component creating");
mMarket = new MarketConnector(getBaseContext()); mMarket = new MarketConnector(getBaseContext());
mNativeBinder = new BinderConnector(mMarket); mNativeBinder = new BinderConnector(mMarket);
mEngineInterface = mNativeBinder.Connect(); if (mNativeBinder.Init()) {
Log.i(TAG, "Service started successfully"); mEngineInterface = mNativeBinder.Connect();
Log.i(TAG, "Service started successfully");
} else {
Log.e(TAG, "Cannot initialize native part of OpenCV Manager!");
Log.e(TAG, "Using stub instead");
mEngineInterface = new OpenCVEngineInterface.Stub() {
@Override
public boolean installVersion(String version) throws RemoteException {
// TODO Auto-generated method stub
return false;
}
@Override
public String getLibraryList(String version) throws RemoteException {
// TODO Auto-generated method stub
return null;
}
@Override
public String getLibPathByVersion(String version) throws RemoteException {
// TODO Auto-generated method stub
return null;
}
@Override
public int getEngineVersion() throws RemoteException {
return -1;
}
};
}
} }
public IBinder onBind(Intent intent) public IBinder onBind(Intent intent) {
{
Log.i(TAG, "Service onBind called for intent " + intent.toString()); Log.i(TAG, "Service onBind called for intent " + intent.toString());
return mEngineInterface; return mEngineInterface;
} }
public boolean onUnbind(Intent intent) public boolean onUnbind(Intent intent)
{ {
Log.i(TAG, "Service onUnbind called for intent " + intent.toString()); Log.i(TAG, "Service onUnbind called for intent " + intent.toString());

View File

@ -42,6 +42,26 @@ public class ManagerActivity extends Activity
@Override @Override
public void onCreate(Bundle savedInstanceState) { public void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState); super.onCreate(savedInstanceState);
if (!HardwareDetector.mIsReady) {
Log.e(TAG, "Cannot initialize native part of OpenCV Manager!");
AlertDialog dialog = new AlertDialog.Builder(this).create();
dialog.setTitle("OpenCV Manager Error");
dialog.setMessage("OpenCV Manager is incompatible with this device. Please replace it with an appropriate package.");
dialog.setCancelable(false);
dialog.setButton("OK", new DialogInterface.OnClickListener() {
public void onClick(DialogInterface dialog, int which) {
finish();
}
});
dialog.show();
return;
}
setContentView(R.layout.main); setContentView(R.layout.main);
TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue); TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue);
@ -186,6 +206,20 @@ public class ManagerActivity extends Activity
} }
}); });
mPackageChangeReciever = new BroadcastReceiver() {
@Override
public void onReceive(Context context, Intent intent) {
Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever");
Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message");
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
{
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
EngineVersionView.setText("not avaliable");
}
}
};
IntentFilter filter = new IntentFilter(); IntentFilter filter = new IntentFilter();
filter.addAction(Intent.ACTION_PACKAGE_ADDED); filter.addAction(Intent.ACTION_PACKAGE_ADDED);
filter.addAction(Intent.ACTION_PACKAGE_CHANGED); filter.addAction(Intent.ACTION_PACKAGE_CHANGED);
@ -199,17 +233,23 @@ public class ManagerActivity extends Activity
@Override @Override
protected void onDestroy() { protected void onDestroy() {
super.onDestroy(); super.onDestroy();
unregisterReceiver(mPackageChangeReciever); if (mPackageChangeReciever != null)
unregisterReceiver(mPackageChangeReciever);
} }
@Override @Override
protected void onResume() { protected void onResume() {
super.onResume(); super.onResume();
Log.d(TAG, "Filling package list on resume"); if (HardwareDetector.mIsReady) {
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) Log.d(TAG, "Filling package list on resume");
{ OpenCVEngineServiceConnection connection = new OpenCVEngineServiceConnection();
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); if (!bindService(new Intent("org.opencv.engine.BIND"), connection, Context.BIND_AUTO_CREATE)) {
EngineVersionView.setText("not avaliable"); Log.e(TAG, "Cannot bind to OpenCV Manager service!");
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
if (EngineVersionView != null)
EngineVersionView.setText("not avaliable");
unbindService(connection);
}
} }
} }
@ -225,19 +265,7 @@ public class ManagerActivity extends Activity
protected int ManagerApiLevel = 0; protected int ManagerApiLevel = 0;
protected String ManagerVersion; protected String ManagerVersion;
protected BroadcastReceiver mPackageChangeReciever = new BroadcastReceiver() { protected BroadcastReceiver mPackageChangeReciever = null;
@Override
public void onReceive(Context context, Intent intent) {
Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever");
Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message");
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
{
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
EngineVersionView.setText("not avaliable");
}
}
};
protected class OpenCVEngineServiceConnection implements ServiceConnection protected class OpenCVEngineServiceConnection implements ServiceConnection
{ {
@ -246,6 +274,12 @@ public class ManagerActivity extends Activity
public void onServiceConnected(ComponentName name, IBinder service) { public void onServiceConnected(ComponentName name, IBinder service) {
OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service); OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service);
if (EngineService == null) {
Log.e(TAG, "Cannot connect to OpenCV Manager Service!");
unbindService(this);
return;
}
try { try {
ManagerApiLevel = EngineService.getEngineVersion(); ManagerApiLevel = EngineService.getEngineVersion();
} catch (RemoteException e) { } catch (RemoteException e) {

View File

@ -27,23 +27,23 @@ endif()
# the -fPIC flag should be used. # the -fPIC flag should be used.
# ---------------------------------------------------------------------------- # ----------------------------------------------------------------------------
if(UNIX) if(UNIX)
if (__ICL) if (__ICL)
set(CV_ICC __ICL) set(CV_ICC __ICL)
elseif(__ICC) elseif(__ICC)
set(CV_ICC __ICC) set(CV_ICC __ICC)
elseif(__ECL) elseif(__ECL)
set(CV_ICC __ECL) set(CV_ICC __ECL)
elseif(__ECC) elseif(__ECC)
set(CV_ICC __ECC) set(CV_ICC __ECC)
elseif(__INTEL_COMPILER) elseif(__INTEL_COMPILER)
set(CV_ICC __INTEL_COMPILER) set(CV_ICC __INTEL_COMPILER)
elseif(CMAKE_C_COMPILER MATCHES "icc") elseif(CMAKE_C_COMPILER MATCHES "icc")
set(CV_ICC icc_matches_c_compiler) set(CV_ICC icc_matches_c_compiler)
endif() endif()
endif() endif()
if(MSVC AND CMAKE_C_COMPILER MATCHES "icc") if(MSVC AND CMAKE_C_COMPILER MATCHES "icc")
set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS)
endif() endif()
# ---------------------------------------------------------------------------- # ----------------------------------------------------------------------------
@ -64,45 +64,49 @@ if(CMAKE_COMPILER_IS_CLANGCXX)
string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_CLANG_REGEX_VERSION "${CMAKE_OPENCV_CLANG_VERSION_FULL}") string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_CLANG_REGEX_VERSION "${CMAKE_OPENCV_CLANG_VERSION_FULL}")
elseif(CMAKE_COMPILER_IS_GNUCXX) elseif(CMAKE_COMPILER_IS_GNUCXX)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion
OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v
ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
# Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)" # Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)"
# Look for the version number # Look for the version number
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}")
if(NOT CMAKE_GCC_REGEX_VERSION) if(NOT CMAKE_GCC_REGEX_VERSION)
string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}")
endif() endif()
# Split the three parts: # Split the three parts:
string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}") string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}")
list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR) list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR)
list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR) list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR)
set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR}) set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR})
math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}") math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}")
message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})") message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})")
if(WIN32) if(WIN32)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE) OUTPUT_STRIP_TRAILING_WHITESPACE)
if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
set(MINGW64 1) set(MINGW64 1)
endif()
endif() endif()
endif()
endif() endif()
if(MINGW64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64") if(MSVC64 OR MINGW64)
set(X86_64 1) set(X86_64 1)
elseif(MSVC AND NOT CMAKE_CROSSCOMPILING)
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1) set(X86 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*") elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*")
set(ARM 1) set(ARM 1)
endif() endif()

View File

@ -4,7 +4,7 @@ if(APPLE)
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
else(APPLE) else(APPLE)
find_package(OpenCL QUIET) #find_package(OpenCL QUIET)
if (NOT OPENCL_FOUND) if (NOT OPENCL_FOUND)
find_path(OPENCL_ROOT_DIR find_path(OPENCL_ROOT_DIR

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys, glob import sys, glob

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import os, sys, fnmatch, re import os, sys, fnmatch, re

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
ocv domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python. ocv domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python.

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys import sys

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
"""gen_pattern.py """gen_pattern.py
To run: To run:

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
# svgfig.py copyright (C) 2008 Jim Pivarski <jpivarski@gmail.com> # svgfig.py copyright (C) 2008 Jim Pivarski <jpivarski@gmail.com>
# #

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import os, sys, re import os, sys, re

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys import sys
import os.path import os.path

View File

@ -455,7 +455,7 @@ protected:
TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); } TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
*/ */
TEST(Core_globbing, accurasy) TEST(Core_globbing, accuracy)
{ {
std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*"; std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*";
std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png"; std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png";

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys, re import sys, re

View File

@ -154,6 +154,11 @@ the symptoms were damaged image and 'Corrupt JPEG data: premature end of data se
- USE_TEMP_BUFFER fixes the main problem (improper buffer management) and - USE_TEMP_BUFFER fixes the main problem (improper buffer management) and
prevents bad images in the first place prevents bad images in the first place
11th patch: April 2, 2013, Forrest Reiling forrest.reiling@gmail.com
Added v4l2 support for getting capture property CV_CAP_PROP_POS_MSEC.
Returns the millisecond timestamp of the last frame grabbed or 0 if no frames have been grabbed
Used to successfully synchonize 2 Logitech C310 USB webcams to within 16 ms of one another
make & enjoy! make & enjoy!
@ -320,6 +325,8 @@ typedef struct CvCaptureCAM_V4L
struct v4l2_queryctrl queryctrl; struct v4l2_queryctrl queryctrl;
struct v4l2_querymenu querymenu; struct v4l2_querymenu querymenu;
struct timeval timestamp;
/* V4L2 control variables */ /* V4L2 control variables */
int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max; int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max;
int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max; int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max;
@ -836,6 +843,9 @@ static int _capture_V4L2 (CvCaptureCAM_V4L *capture, char *deviceName)
capture->v4l2_gain_max = 0; capture->v4l2_gain_max = 0;
capture->v4l2_exposure_max = 0; capture->v4l2_exposure_max = 0;
capture->timestamp.tv_sec = 0;
capture->timestamp.tv_usec = 0;
/* Scan V4L2 controls */ /* Scan V4L2 controls */
v4l2_scan_controls(capture); v4l2_scan_controls(capture);
@ -1221,6 +1231,9 @@ static int read_frame_v4l2(CvCaptureCAM_V4L* capture) {
if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf)) if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf))
perror ("VIDIOC_QBUF"); perror ("VIDIOC_QBUF");
//set timestamp in capture struct to be timestamp of most recent frame
capture->timestamp = buf.timestamp;
return 1; return 1;
} }
@ -2308,6 +2321,13 @@ static double icvGetPropertyCAM_V4L (CvCaptureCAM_V4L* capture,
/* initialize the control structure */ /* initialize the control structure */
switch (property_id) { switch (property_id) {
case CV_CAP_PROP_POS_MSEC:
if (capture->FirstCapture) {
return 0;
} else {
return 1000 * capture->timestamp.tv_sec + ((double) capture->timestamp.tv_usec) / 1000;
}
break;
case CV_CAP_PROP_BRIGHTNESS: case CV_CAP_PROP_BRIGHTNESS:
capture->control.id = V4L2_CID_BRIGHTNESS; capture->control.id = V4L2_CID_BRIGHTNESS;
break; break;

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys, os, re import sys, os, re

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys, re, os.path import sys, re, os.path
from string import Template from string import Template

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import os, sys, re, string, glob import os, sys, re, string, glob
from optparse import OptionParser from optparse import OptionParser

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import os, sys, re, string, fnmatch import os, sys, re, string, fnmatch
allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"] allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"]

View File

@ -76,7 +76,7 @@ namespace cv
size_t wave_size = 0; size_t wave_size = 0;
queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size)); std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
OPTION_INIT = true; OPTION_INIT = true;
} }
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);

View File

@ -127,8 +127,9 @@ namespace cv
// currently only support wavefront size queries // currently only support wavefront size queries
enum DEVICE_INFO enum DEVICE_INFO
{ {
WAVEFRONT_SIZE, //in AMD speak WAVEFRONT_SIZE, //in AMD speak
WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak
IS_CPU_DEVICE //check if the device is CPU
}; };
//info should have been pre-allocated //info should have been pre-allocated
void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info); void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);

View File

@ -91,9 +91,6 @@ namespace cv
extern const char *arithm_bitwise_xor_scalar_mask; extern const char *arithm_bitwise_xor_scalar_mask;
extern const char *arithm_compare_eq; extern const char *arithm_compare_eq;
extern const char *arithm_compare_ne; extern const char *arithm_compare_ne;
extern const char *arithm_sub;
extern const char *arithm_sub_scalar;
extern const char *arithm_sub_scalar_mask;
extern const char *arithm_mul; extern const char *arithm_mul;
extern const char *arithm_div; extern const char *arithm_div;
extern const char *arithm_absdiff; extern const char *arithm_absdiff;
@ -260,11 +257,11 @@ void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const ocl
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst) void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
{ {
arithmetic_run(src1, src2, dst, "arithm_sub", &arithm_sub); arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add);
} }
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{ {
arithmetic_run(src1, src2, dst, mask, "arithm_sub_with_mask", &arithm_sub); arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add);
} }
typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
const char **kernelString, void *scalar); const char **kernelString, void *scalar);
@ -451,14 +448,16 @@ void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const ocl
void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{ {
String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1); arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1);
} }
void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask) void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
{ {
String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
} }
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)

View File

@ -394,6 +394,15 @@ namespace cv
} }
break; break;
case IS_CPU_DEVICE:
{
cl_device_type devicetype;
openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
CL_DEVICE_TYPE, sizeof(cl_device_type),
&devicetype, NULL));
*(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU);
}
break;
default: default:
CV_Error(-1, "Invalid device info type"); CV_Error(-1, "Invalid device info type");
break; break;

View File

@ -393,7 +393,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
if( rtype < 0 ) if( rtype < 0 )
rtype = type(); rtype = type();
else else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels());
//int scn = channels(); //int scn = channels();
int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype); int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);

View File

@ -52,6 +52,11 @@
#endif #endif
#endif #endif
#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A)+(B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A)-(B))
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////ADD//////////////////////////////////////////////////// /////////////////////////////////////////////ADD////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
@ -95,7 +100,7 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp); uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -134,7 +139,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
ushort4 tmp_data = convert_ushort4_sat(tmp); ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -172,7 +177,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
short4 tmp_data = convert_short4_sat(tmp); short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -200,7 +205,7 @@ __kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index));
long tmp = (long)(data1) + (long)(data2); long tmp = ARITHM_OP((long)(data1), (long)(data2));
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
} }
@ -221,7 +226,7 @@ __kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offse
float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = data1 + data2; float tmp = ARITHM_OP(data1, data2);
*((__global float *)((__global char *)dst + dst_index)) = tmp; *((__global float *)((__global char *)dst + dst_index)) = tmp;
} }
@ -245,7 +250,7 @@ __kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offs
double data1 = *((__global double *)((__global char *)src1 + src1_index)); double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index)); double data2 = *((__global double *)((__global char *)src2 + src2_index));
*((__global double *)((__global char *)dst + dst_index)) = data1 + data2; *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
} }
} }
#endif #endif
@ -302,7 +307,7 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
} }
uchar4 data = *((__global uchar4 *)(dst + dst_index)); uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp); uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
@ -344,7 +349,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
uchar2 mask_data = vload2(0, mask + mask_index); uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
ushort2 tmp_data = convert_ushort2_sat(tmp); ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
@ -384,7 +389,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
uchar2 mask_data = vload2(0, mask + mask_index); uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
short2 tmp_data = convert_short2_sat(tmp); short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
@ -416,7 +421,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
int dst_data = *((__global int *)((__global char *)dst + dst_index)); int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2); int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data; *((__global int *)((__global char *)dst + dst_index)) = data;
@ -446,7 +451,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
float dst_data = *((__global float *)((__global char *)dst + dst_index)); float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2; float data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data; *((__global float *)((__global char *)dst + dst_index)) = data;
@ -477,7 +482,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
double dst_data = *((__global double *)((__global char *)dst + dst_index)); double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2; double data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data; *((__global double *)((__global char *)dst + dst_index)) = data;
@ -516,7 +521,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
uchar2 mask_data = vload2(0, mask + mask_index); uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index)); uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp); uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
@ -548,7 +553,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
ushort2 data = convert_ushort2_sat(tmp); ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
@ -578,7 +583,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
short2 data = convert_short2_sat(tmp); short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
@ -608,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2)); int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data; *((__global int2 *)((__global char *)dst + dst_index)) = data;
@ -637,7 +642,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2; float2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data; *((__global float2 *)((__global char *)dst + dst_index)) = data;
@ -668,329 +673,14 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2; double2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data; *((__global double2 *)((__global char *)dst + dst_index)) = data;
} }
} }
#endif #endif
__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0));
uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1));
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0));
ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0));
short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
uchar mask_data = * (mask + mask_index);
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0);
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
uchar mask_data = * (mask + mask_index);
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = src1_data_0 + src2_data_0;
float tmp_data_1 = src1_data_1 + src2_data_1;
float tmp_data_2 = src1_data_2 + src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
uchar mask_data = * (mask + mask_index);
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 + src2_data_0;
double tmp_data_1 = src1_data_1 + src2_data_1;
double tmp_data_2 = src1_data_2 + src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset, __global uchar *mask, int mask_step, int mask_offset,
@ -1014,7 +704,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2)); uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data; *((__global uchar4 *)(dst + dst_index)) = data;
@ -1043,7 +733,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
@ -1072,7 +762,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data; *((__global short4 *)((__global char *)dst + dst_index)) = data;
@ -1101,7 +791,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2)); int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data; *((__global int4 *)((__global char *)dst + dst_index)) = data;
@ -1130,7 +820,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src_data2; float4 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data; *((__global float4 *)((__global char *)dst + dst_index)) = data;
@ -1161,7 +851,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src_data2; double4 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data; data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data; *((__global double4 *)((__global char *)dst + dst_index)) = data;

View File

@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
if (x < thread_cols && y < rows) if (x < thread_cols && y < rows)
{ {
int src_index_0 = mad24(y, src_step, (x) + src_offset); int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
uchar data0 = *(src + src_index_0); uchar data0 = *(src + src_index_0);
uchar data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0; *(dst + dst_index_1) = data0;
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
uchar data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
} }
} }
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset, __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,

File diff suppressed because it is too large Load Diff

View File

@ -1,806 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************sub with scalar without mask**************************************/
__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
int data = convert_int_sat(tmp);
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float tmp = src_data1 - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double data = src_data1 - src2_data;
data = isMatSubScalar ? data : -data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 tmp = src_data1 - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float2 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
short2 tmp_data_0 = convert_short2_sat(tmp_0);
short2 tmp_data_1 = convert_short2_sat(tmp_1);
short2 tmp_data_2 = convert_short2_sat(tmp_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
int tmp_data_0 = convert_int_sat(tmp_0);
int tmp_data_1 = convert_int_sat(tmp_1);
int tmp_data_2 = convert_int_sat(tmp_2);
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_0 = src1_data_0 - src2_data_0;
float tmp_1 = src1_data_1 - src2_data_1;
float tmp_2 = src1_data_2 - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 - src2_data_0;
double tmp_data_1 = src1_data_1 - src2_data_1;
double tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 tmp = src_data1 - src2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -1,941 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************sub with scalar with mask**************************************/
__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : - tmp;
int data = convert_int_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src_data2 = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
short2 tmp_data_0 = convert_short2_sat(tmp_0);
short2 tmp_data_1 = convert_short2_sat(tmp_1);
short2 tmp_data_2 = convert_short2_sat(tmp_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
int tmp_data_0 = convert_int_sat(tmp_0);
int tmp_data_1 = convert_int_sat(tmp_1);
int tmp_data_2 = convert_int_sat(tmp_2);
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = src1_data_0 - src2_data_0;
float tmp_data_1 = src1_data_1 - src2_data_1;
float tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 - src2_data_0;
double tmp_data_1 = src1_data_1 - src2_data_1;
double tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -1,5 +1,58 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#define MAX_FLOAT 1e7f #define MAX_FLOAT 3.40282e+038f
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif
#ifndef MAX_DESC_LEN
#define MAX_DESC_LEN 64
#endif
int bit1Count(float x) int bit1Count(float x)
{ {
@ -13,83 +66,52 @@ int bit1Count(float x)
return (float)c; return (float)c;
} }
#ifndef DIST_TYPE
#define DIST_TYPE 0
#endif
#if (DIST_TYPE == 0)
#define DIST(x, y) fabs((x) - (y))
#elif (DIST_TYPE == 1)
#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
#elif (DIST_TYPE == 2)
#define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y))
#endif
float reduce_block(__local float *s_query, float reduce_block(__local float *s_query,
__local float *s_train, __local float *s_train,
int block_size,
int lidx, int lidx,
int lidy, int lidy
int distType
) )
{ {
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
float result = 0; float result = 0;
switch(distType) #pragma unroll
for (int j = 0 ; j < BLOCK_SIZE ; j++)
{ {
case 0: result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
}
break;
} }
return result; return result;
} }
float reduce_multi_block(__local float *s_query, float reduce_multi_block(__local float *s_query,
__local float *s_train, __local float *s_train,
int max_desc_len,
int block_size,
int block_index, int block_index,
int lidx, int lidx,
int lidy, int lidy
int distType
) )
{ {
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
float result = 0; float result = 0;
switch(distType) #pragma unroll
for (int j = 0 ; j < BLOCK_SIZE ; j++)
{ {
case 0: result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
} }
return result; return result;
} }
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size /* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
local size: dim0 is block_size, dim1 is block_size. local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
*/ */
__kernel void BruteForceMatch_UnrollMatch_D5( __kernel void BruteForceMatch_UnrollMatch_D5(
__global float *query, __global float *query,
@ -98,29 +120,28 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * max_desc_len; __local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
int queryIdx = groupidx * block_size + lidy; int queryIdx = groupidx * BLOCK_SIZE + lidy;
// load the query into local memory. // load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++) #pragma unroll
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
{ {
int loadx = lidx + i * block_size; int loadx = lidx + i * BLOCK_SIZE;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
} }
float myBestDistance = MAX_FLOAT; float myBestDistance = MAX_FLOAT;
@ -128,24 +149,25 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
// loopUnrolledCached to find the best trainIdx and best distance. // loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0; volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
{ {
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++) #pragma unroll
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
{ {
//load a block_size * block_size block into local train. //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already. //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
int trainIdx = t * block_size + lidx; int trainIdx = t * BLOCK_SIZE + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
{ {
@ -157,18 +179,19 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float*)(sharebuffer); __local float *s_distance = (__local float*)(sharebuffer);
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
//find BestMatch //find BestMatch
s_distance += lidy * block_size; s_distance += lidy * BLOCK_SIZE;
s_trainIdx += lidy * block_size; s_trainIdx += lidy * BLOCK_SIZE;
s_distance[lidx] = myBestDistance; s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx; s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- now all reduce implement in each threads. //reduce -- now all reduce implement in each threads.
for (int k = 0 ; k < block_size; k++) #pragma unroll
for (int k = 0 ; k < BLOCK_SIZE; k++)
{ {
if (myBestDistance > s_distance[k]) if (myBestDistance > s_distance[k])
{ {
@ -191,53 +214,51 @@ __kernel void BruteForceMatch_Match_D5(
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * BLOCK_SIZE + lidy;
float myBestDistance = MAX_FLOAT; float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1; int myBestTrainIdx = -1;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
// loop // loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{ {
//Dist dist; //Dist dist;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
{ {
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
//load query and train into local memory //load query and train into local memory
s_query[lidy * block_size + lidx] = 0; s_query[lidy * BLOCK_SIZE + lidx] = 0;
s_train[lidx * block_size + lidy] = 0; s_train[lidx * BLOCK_SIZE + lidy] = 0;
if (loadx < query_cols) if (loadx < query_cols)
{ {
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); result += reduce_block(s_query, s_train, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
const int trainIdx = t * block_size + lidx; const int trainIdx = t * BLOCK_SIZE + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{ {
@ -250,18 +271,18 @@ __kernel void BruteForceMatch_Match_D5(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer; __local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
//findBestMatch //findBestMatch
s_distance += lidy * block_size; s_distance += lidy * BLOCK_SIZE;
s_trainIdx += lidy * block_size; s_trainIdx += lidy * BLOCK_SIZE;
s_distance[lidx] = myBestDistance; s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx; s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- now all reduce implement in each threads. //reduce -- now all reduce implement in each threads.
for (int k = 0 ; k < block_size; k++) for (int k = 0 ; k < BLOCK_SIZE; k++)
{ {
if (myBestDistance > s_distance[k]) if (myBestDistance > s_distance[k])
{ {
@ -287,16 +308,13 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int bestTrainIdx_cols, int bestTrainIdx_cols,
int step, int step,
int ostep, int ostep
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
@ -304,25 +322,25 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1); const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy; const int queryIdx = groupidy * BLOCK_SIZE + lidy;
const int trainIdx = groupidx * block_size + lidx; const int trainIdx = groupidx * BLOCK_SIZE + lidx;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i) for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
{ {
//load a block_size * block_size block into local train. //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already. //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); result += reduce_block(s_query, s_train, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
@ -350,15 +368,13 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int bestTrainIdx_cols, int bestTrainIdx_cols,
int step, int step,
int ostep, int ostep
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
@ -366,25 +382,25 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1); const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy; const int queryIdx = groupidy * BLOCK_SIZE + lidy;
const int trainIdx = groupidx * block_size + lidx; const int trainIdx = groupidx * BLOCK_SIZE + lidx;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
{ {
//load a block_size * block_size block into local train. //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already. //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); result += reduce_block(s_query, s_train, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
@ -410,29 +426,26 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * BLOCK_SIZE + lidy;
local float *s_query = sharebuffer; local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * max_desc_len; local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
// load the query into local memory. // load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++) for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
{ {
int loadx = lidx + i * block_size; int loadx = lidx + i * BLOCK_SIZE;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
} }
float myBestDistance1 = MAX_FLOAT; float myBestDistance1 = MAX_FLOAT;
@ -442,25 +455,25 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
//loopUnrolledCached //loopUnrolledCached
volatile int imgIdx = 0; volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{ {
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++) for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
{ {
const int loadX = lidx + i * block_size; const int loadX = lidx + i * BLOCK_SIZE;
//load a block_size * block_size block into local train. //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already. //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
const int trainIdx = t * block_size + lidx; const int trainIdx = t * BLOCK_SIZE + lidx;
if (queryIdx < query_rows && trainIdx < train_rows) if (queryIdx < query_rows && trainIdx < train_rows)
{ {
@ -482,11 +495,11 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
local float *s_distance = (local float *)sharebuffer; local float *s_distance = (local float *)sharebuffer;
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
// find BestMatch // find BestMatch
s_distance += lidy * block_size; s_distance += lidy * BLOCK_SIZE;
s_trainIdx += lidy * block_size; s_trainIdx += lidy * BLOCK_SIZE;
s_distance[lidx] = myBestDistance1; s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1; s_trainIdx[lidx] = myBestTrainIdx1;
@ -499,7 +512,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
if (lidx == 0) if (lidx == 0)
{ {
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < BLOCK_SIZE ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
if (val < bestDistance1) if (val < bestDistance1)
@ -527,7 +540,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
if (lidx == 0) if (lidx == 0)
{ {
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < BLOCK_SIZE ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
@ -559,22 +572,20 @@ __kernel void BruteForceMatch_knnMatch_D5(
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step
int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * BLOCK_SIZE + lidy;
local float *s_query = sharebuffer; local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * block_size; local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
float myBestDistance1 = MAX_FLOAT; float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT; float myBestDistance2 = MAX_FLOAT;
@ -582,30 +593,30 @@ __kernel void BruteForceMatch_knnMatch_D5(
int myBestTrainIdx2 = -1; int myBestTrainIdx2 = -1;
//loop //loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{ {
float result = 0.0f; float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
{ {
const int loadx = lidx + i * block_size; const int loadx = lidx + i * BLOCK_SIZE;
//load query and train into local memory //load query and train into local memory
s_query[lidy * block_size + lidx] = 0; s_query[lidy * BLOCK_SIZE + lidx] = 0;
s_train[lidx * block_size + lidy] = 0; s_train[lidx * BLOCK_SIZE + lidy] = 0;
if (loadx < query_cols) if (loadx < query_cols)
{ {
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); result += reduce_block(s_query, s_train, lidx, lidy);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
const int trainIdx = t * block_size + lidx; const int trainIdx = t * BLOCK_SIZE + lidx;
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
{ {
@ -627,11 +638,11 @@ __kernel void BruteForceMatch_knnMatch_D5(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer; __local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
//findBestMatch //findBestMatch
s_distance += lidy * block_size; s_distance += lidy * BLOCK_SIZE;
s_trainIdx += lidy * block_size; s_trainIdx += lidy * BLOCK_SIZE;
s_distance[lidx] = myBestDistance1; s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1; s_trainIdx[lidx] = myBestTrainIdx1;
@ -644,7 +655,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
if (lidx == 0) if (lidx == 0)
{ {
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < BLOCK_SIZE ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
if (val < bestDistance1) if (val < bestDistance1)
@ -672,7 +683,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
if (lidx == 0) if (lidx == 0)
{ {
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < BLOCK_SIZE ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
@ -703,14 +714,11 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
//__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step)
int distType)
{ {
/* Todo */ /* Todo */
} }
@ -721,13 +729,11 @@ kernel void BruteForceMatch_calcDistance_D5(
//__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size,
int query_rows, int query_rows,
int query_cols, int query_cols,
int train_rows, int train_rows,
int train_cols, int train_cols,
int step, int step)
int distType)
{ {
/* Todo */ /* Todo */
} }
@ -736,8 +742,7 @@ kernel void BruteForceMatch_findBestMatch_D5(
__global float *allDist, __global float *allDist,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
int k, int k
int block_size
) )
{ {
/* Todo */ /* Todo */

View File

@ -96,18 +96,18 @@ The info above maybe obsolete.
***********************************************************************************/ ***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global const uchar * restrict src, (__global const uchar * restrict src,
__global float * dst, __global float * dst,
const int dst_cols, const int dst_cols,
const int dst_rows, const int dst_rows,
const int src_whole_cols, const int src_whole_cols,
const int src_whole_rows, const int src_whole_rows,
const int src_step_in_pixel, const int src_step_in_pixel,
const int src_offset_x, const int src_offset_x,
const int src_offset_y, const int src_offset_y,
const int dst_step_in_pixel, const int dst_step_in_pixel,
const int radiusy, const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{ {
int x = get_global_id(0)<<2; int x = get_global_id(0)<<2;
int y = get_global_id(1); int y = get_global_id(1);
@ -122,17 +122,17 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
uchar4 temp[READ_TIMES_ROW]; uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
int current_addr = start_addr+i*LSIZE0*4; int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr]; temp[i] = *(__global uchar4*)&src[current_addr];
} }
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x); temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y); temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
@ -140,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w); temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]); temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
} }
#else #else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW]; int4 index[READ_TIMES_ROW];
int4 addr; int4 addr;
@ -148,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
if(not_all_in_range) if(not_all_in_range)
{ {
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4); index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x); index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
@ -162,7 +162,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
s_y= ADDR_L(start_y,0,src_whole_rows,start_y); s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y); s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]); addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x]; temp[i].x = src[addr.x];
@ -174,15 +174,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
else else
{ {
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4]; temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
} }
} }
#endif #endif
//save pixels to lds //save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
} }
@ -190,7 +190,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
//read pixels from lds and calculate the result //read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX]; sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++) for(i=1; i<=RADIUSX; i++)
{ {
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i); temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i); temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
@ -219,18 +219,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
} }
} }
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global const uchar4 * restrict src, (__global const uchar4 * restrict src,
__global float4 * dst, __global float4 * dst,
const int dst_cols, const int dst_cols,
const int dst_rows, const int dst_rows,
const int src_whole_cols, const int src_whole_cols,
const int src_whole_rows, const int src_whole_rows,
const int src_step_in_pixel, const int src_step_in_pixel,
const int src_offset_x, const int src_offset_x,
const int src_offset_y, const int src_offset_y,
const int dst_step_in_pixel, const int dst_step_in_pixel,
const int radiusy, const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -244,26 +244,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
uchar4 temp[READ_TIMES_ROW]; uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
int current_addr = start_addr+i*LSIZE0; int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr]; temp[i] = src[current_addr];
} }
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]); temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]); temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
} }
#else #else
int index[READ_TIMES_ROW]; int index[READ_TIMES_ROW];
int s_x,s_y; int s_x,s_y;
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0); s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x); s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
@ -272,14 +272,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
index[i]=mad24(s_y,src_step_in_pixel,s_x); index[i]=mad24(s_y,src_step_in_pixel,s_x);
} }
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i] = src[index[i]]; temp[i] = src[index[i]];
} }
#endif #endif
//save pixels to lds //save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
} }
@ -287,7 +287,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
//read pixels from lds and calculate the result //read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX]; sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++) for(i=1; i<=RADIUSX; i++)
{ {
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
@ -302,18 +302,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
} }
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global const float * restrict src, (__global const float * restrict src,
__global float * dst, __global float * dst,
const int dst_cols, const int dst_cols,
const int dst_rows, const int dst_rows,
const int src_whole_cols, const int src_whole_cols,
const int src_whole_rows, const int src_whole_rows,
const int src_step_in_pixel, const int src_step_in_pixel,
const int src_offset_x, const int src_offset_x,
const int src_offset_y, const int src_offset_y,
const int dst_step_in_pixel, const int dst_step_in_pixel,
const int radiusy, const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -327,26 +327,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
float temp[READ_TIMES_ROW]; float temp[READ_TIMES_ROW];
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
int current_addr = start_addr+i*LSIZE0; int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr]; temp[i] = src[current_addr];
} }
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]); temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]); temp[i]= ELEM(start_y,0,src_whole_rows,(float)0,temp[i]);
} }
#else #else
int index[READ_TIMES_ROW]; int index[READ_TIMES_ROW];
int s_x,s_y; int s_x,s_y;
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0); s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x); s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
@ -355,14 +355,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
index[i]=mad24(s_y,src_step_in_pixel,s_x); index[i]=mad24(s_y,src_step_in_pixel,s_x);
} }
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i] = src[index[i]]; temp[i] = src[index[i]];
} }
#endif #endif
//save pixels to lds //save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
} }
@ -370,7 +370,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
//read pixels from lds and calculate the result //read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++) for(i=1; i<=RADIUSX; i++)
{ {
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
@ -385,18 +385,18 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
} }
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global const float4 * restrict src, (__global const float4 * restrict src,
__global float4 * dst, __global float4 * dst,
const int dst_cols, const int dst_cols,
const int dst_rows, const int dst_rows,
const int src_whole_cols, const int src_whole_cols,
const int src_whole_rows, const int src_whole_rows,
const int src_step_in_pixel, const int src_step_in_pixel,
const int src_offset_x, const int src_offset_x,
const int src_offset_y, const int src_offset_y,
const int dst_step_in_pixel, const int dst_step_in_pixel,
const int radiusy, const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -410,26 +410,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
float4 temp[READ_TIMES_ROW]; float4 temp[READ_TIMES_ROW];
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
int current_addr = start_addr+i*LSIZE0; int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr]; temp[i] = src[current_addr];
} }
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]); temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]); temp[i]= ELEM(start_y,0,src_whole_rows,(float4)0,temp[i]);
} }
#else #else
int index[READ_TIMES_ROW]; int index[READ_TIMES_ROW];
int s_x,s_y; int s_x,s_y;
//judge if read out of boundary //judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0); s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x); s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
@ -438,14 +438,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
index[i]=mad24(s_y,src_step_in_pixel,s_x); index[i]=mad24(s_y,src_step_in_pixel,s_x);
} }
//read pixels from src //read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
temp[i] = src[index[i]]; temp[i] = src[index[i]];
} }
#endif #endif
//save pixels to lds //save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++) for(i = 0; i<READ_TIMES_ROW; i++)
{ {
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
} }
@ -453,7 +453,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
//read pixels from lds and calculate the result //read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++) for(i=1; i<=RADIUSX; i++)
{ {
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
@ -465,4 +465,5 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
start_addr = mad24(y,dst_step_in_pixel,x); start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum; dst[start_addr] = sum;
} }
} }

View File

@ -114,7 +114,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
@ -125,7 +125,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
@ -143,7 +143,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
@ -162,7 +162,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = *(src + selected_row * src_step + selected_cols); data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
@ -185,17 +185,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
{ {
int local_row = (lX >> THREADS_PER_ROW_BIT) + i; int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
} }
} }
} }
@ -207,7 +207,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum); *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
} }
} }
} }
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1//////////////////////////////////////////////////////// /////////////////////////////////////////32FC1////////////////////////////////////////////////////////
@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
@ -295,17 +295,17 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
{ {
int local_row = (lX >> THREADS_PER_ROW_BIT) + i; int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * data); sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
} }
} }
} }
@ -318,7 +318,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum; *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
} }
} }
} }
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0); int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0); int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3) #define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
@ -410,17 +410,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
#pragma unroll 3 #pragma unroll 3
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
if(dst_rows_index < dst_rows_end) if(dst_rows_index < dst_rows_end)
{ {
int local_row = (lX >> THREADS_PER_ROW_BIT) + i; int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
} }
} }
} }
@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
{ {
if((rows_start_index - src_offset_y) + i < rows + ANY) if((rows_start_index - src_offset_y) + i < rows + ANY)
{ {
#ifdef BORDER_CONSTANT #ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i; int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX; int selected_cols = cols_start_index_group + lX;
@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = con ? data : 0; data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
} }
#else #else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
} }
#endif #endif
} }
} }
} }
@ -519,10 +519,10 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
for(int i = 0; i < ANCHOR; i++) for(int i = 0; i < ANCHOR; i++)
{ {
for(int j = 0; j < ANCHOR; j++) for(int j = 0; j < ANCHOR; j++)
{ {
int local_cols = lX + j; int local_cols = lX + j;
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]; sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
} }
} }

View File

@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
#define LSIZE 256 #define LSIZE 256
#define LSIZE_1 255 #define LSIZE_1 255
@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
gid = gid << 1; gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
if(lid > 0 && (i+lid) <= rows){ if(lid > 0 && (i+lid) <= rows)
{
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0]; lm_sqsum[0][bf_loc] += sqsum_t[0];
@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
src_step = src_step >> 4; src_step = src_step >> 4;
for(int i = 0; i < rows; i =i + LSIZE_1) for(int i = 0; i < rows; i =i + LSIZE_1)
{ {
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0; sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0; sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if(gid == 0 && (i + lid) <= rows) if(gid == 0 && (i + lid) <= rows)
{ {
sum[sum_offset + i + lid] = 0; sum[sum_offset + i + lid] = 0;
sqsum[sqsum_offset + i + lid] = 0; sqsum[sqsum_offset + i + lid] = 0;
} }
if(i + lid == 0) if(i + lid == 0)
{ {
int loc0 = gid * 2 * sum_step; int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step; int loc1 = gid * 2 * sqsum_step;
for(int k = 1;k <= 8;k++) for(int k = 1; k <= 8; k++)
{ {
if(gid * 8 + k > cols) break; if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0; sum[sum_offset + loc0 + k * sum_step / 4] = 0;
@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
} }
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
if(lid > 0 && (i+lid) <= rows){ if(lid > 0 && (i+lid) <= rows)
{
lm_sum[0][bf_loc] += sum_t[0]; lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1]; lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0]; lm_sqsum[0][bf_loc] += sqsum_t[0];

View File

@ -47,8 +47,12 @@
//warpAffine kernel //warpAffine kernel
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
#if defined DOUBLE_SUPPORT #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F; typedef double F;
typedef double4 F4; typedef double4 F4;
#define convert_F4 convert_double4 #define convert_F4 convert_double4
@ -58,7 +62,6 @@ typedef float4 F4;
#define convert_F4 convert_float4 #define convert_F4 convert_float4
#endif #endif
#define INTER_BITS 5 #define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS) #define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE #define INTER_SCALE 1.f/INTER_TAB_SIZE
@ -81,8 +84,8 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1********************************************* /**********************************************8UC1*********************************************
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -123,14 +126,14 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval; dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval; *d = dval;
} }
} }
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -206,16 +209,16 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
taby = INTER_SCALE * convert_float4(ay); taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax); tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE )); itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
int4 val; int4 val;
uchar4 tval; uchar4 tval;
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
@ -228,8 +231,8 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
} }
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -255,10 +258,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
for(j=0; j<4; j++) for(j=0; j<4; j++)
{ {
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
} }
short itab[16]; short itab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
@ -288,7 +291,7 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2; mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2; Mk1 = k1, Mk2 = k2;
} }
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
} }
@ -309,8 +312,8 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -333,8 +336,8 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global
} }
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -386,8 +389,8 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
} }
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -418,10 +421,10 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int i,j; int i,j;
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
for(j=0; j<4; j++) for(j=0; j<4; j++)
{ {
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
} }
int itab[16]; int itab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
float axx, ayy; float axx, ayy;
@ -447,14 +450,14 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int diff = isum - INTER_REMAP_COEF_SCALE; int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2; int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ ) for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ ) for( k2 = 2; k2 < 4; k2++ )
{ {
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2; mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2; Mk1 = k1, Mk2 = k2;
} }
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
@ -477,8 +480,8 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -501,8 +504,8 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int
} }
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -548,12 +551,12 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum; dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
} }
} }
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -617,8 +620,8 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -636,13 +639,13 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
short sy0 = (short)(Y0 >> AB_BITS); short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0; dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
} }
} }
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
float4 v0, v1, v2, v3; float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
@ -691,12 +694,12 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum; dst[dst_offset+dy*dstStep+dx] = sum;
} }
} }
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
@ -754,5 +757,5 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
dst[dst_offset+dy*dstStep+dx] = sum; dst[dst_offset+dy*dstStep+dx] = sum;
} }
} }
} }

View File

@ -47,8 +47,12 @@
//wrapPerspective kernel //wrapPerspective kernel
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
#if defined DOUBLE_SUPPORT #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F; typedef double F;
typedef double4 F4; typedef double4 F4;
#define convert_F4 convert_double4 #define convert_F4 convert_double4
@ -81,8 +85,8 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1********************************************* /**********************************************8UC1*********************************************
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -112,14 +116,14 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval; dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
*d = dval; *d = dval;
} }
} }
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
int i; int i;
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
short itab[4]; short itab[4];
float tab1y[2], tab1x[2]; float tab1y[2], tab1x[2];
@ -170,8 +174,8 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
} }
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -190,15 +194,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16]; uchar v[16];
int i, j; int i, j;
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
for(j=0; j<4; j++) for(j=0; j<4; j++)
{ {
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
} }
short itab[16]; short itab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
@ -227,7 +231,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2; mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2; Mk1 = k1, Mk2 = k2;
} }
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
} }
@ -249,8 +253,8 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -273,8 +277,8 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
} }
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
int4 v0, v1, v2, v3; int4 v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
int itab0, itab1, itab2, itab3; int itab0, itab1, itab2, itab3;
float taby, tabx; float taby, tabx;
@ -323,8 +327,8 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
} }
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -352,10 +356,10 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
int i,j; int i,j;
#pragma unroll 4 #pragma unroll 4
for(i=0; i<4; i++) for(i=0; i<4; i++)
for(j=0; j<4; j++) for(j=0; j<4; j++)
{ {
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
} }
int itab[16]; int itab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
float axx, ayy; float axx, ayy;
@ -381,14 +385,14 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
int diff = isum - INTER_REMAP_COEF_SCALE; int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2; int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ ) for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ ) for( k2 = 2; k2 < 4; k2++ )
{ {
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2; mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2; Mk1 = k1, Mk2 = k2;
} }
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
@ -411,8 +415,8 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -434,8 +438,8 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
} }
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
float v0, v1, v2, v3; float v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
@ -483,8 +487,8 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
} }
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
@ -546,8 +550,8 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
***********************************************************************************************/ ***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -564,13 +568,13 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
short sy = (short)Y; short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0; dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
} }
} }
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep, int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols ) int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
float4 v0, v1, v2, v3; float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
float tab[4]; float tab[4];
float taby[2], tabx[2]; float taby[2], tabx[2];
@ -622,8 +626,8 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
} }
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
int i; int i;
for(i=0; i<16; i++) for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
float tab[16]; float tab[16];
float tab1y[4], tab1x[4]; float tab1y[4], tab1x[4];
@ -680,5 +684,5 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
dst[dst_offset+dy*dstStep+dx] = sum; dst[dst_offset+dy*dstStep+dx] = sum;
} }
} }
} }

View File

@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
for(j = 0; j < tpl_cols; j ++) for(j = 0; j < tpl_cols; j ++)
{ {
sum = mad24(img_ptr[j], tpl_ptr[j], sum); sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
} }
} }
res[res_idx] = sum; res[res_idx] = (float)sum;
} }
} }
@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
} }
} }
res[res_idx] = sum.x + sum.y + sum.z + sum.w; res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
} }
} }
@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float sum = (float)( float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
res[res_idx] -= sum * tpl_sum; res[res_idx] -= sum * tpl_sum;
} }
} }

View File

@ -53,76 +53,96 @@
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
// Histogram computation // Histogram computation
// 12 threads for a cell, 12x4 threads per block
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, __kernel void compute_hists_kernel(
const int cnbins, const int cblock_hist_size, const int img_block_width, const int cblock_stride_x, const int cblock_stride_y,
const int grad_quadstep, const int qangle_step, const int cnbins, const int cblock_hist_size, const int img_block_width,
__global const float* grad, __global const uchar* qangle, const int blocks_in_group, const int blocks_total,
const float scale, __global float* block_hists, __local float* smem) const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
const float scale, __global float* block_hists, __local float* smem)
{ {
const int lidX = get_local_id(0); const int lx = get_local_id(0);
const int lp = lx / 24; /* local group id */
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
const int gidY = gid / img_block_width;
const int gidX = gid - gidY * img_block_width;
const int lidX = lx - lp * 24;
const int lidY = get_local_id(1); const int lidY = get_local_id(1);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
const int cell_x = lidX / 16; const int cell_x = lidX / 12;
const int cell_y = lidY; const int cell_y = lidY;
const int cell_thread_x = lidX & 0xF; const int cell_thread_x = lidX - cell_x * 12;
__local float* hists = smem; __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
__local float* final_hist = smem + cnbins * 48; CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
__local float* final_hist = hists + cnbins *
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
const int offset_y = gidY * cblock_stride_y + (cell_y << 2); const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
__global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1); __global const float* grad_ptr = (gid < blocks_total) ?
__global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1); grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
__global const uchar* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
// 12 means that 12 pixels affect on block's cell (in one row) __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
if (cell_thread_x < 12) cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48] = 0.f;
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
const int dist_y_begin = -4 - 4 * lidY;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{ {
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x; float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
for (int bin_id = 0; bin_id < cnbins; ++bin_id) uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
hist[bin_id * 48] = 0.f;
const int dist_x = -4 + cell_thread_x - 4 * cell_x; grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
const int dist_y_begin = -4 - 4 * lidY; int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep; float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
qangle_ptr += qangle_step; dist_center_x) * scale);
float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
(8.f - fabs(dist_x + 0.5f)) / 64.f;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); hist[bin.x * 48] += gaussian * interp_weight * vote.x;
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); hist[bin.y * 48] += gaussian * interp_weight * vote.y;
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale);
float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
}
volatile __local float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if (cell_thread_x < 6) hist_[0] += hist_[6];
if (cell_thread_x < 3) hist_[0] += hist_[3];
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2];
}
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size; volatile __local float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if (cell_thread_x < 6)
hist_[0] += hist_[6];
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef WAVE_SIZE_1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
#ifdef WAVE_SIZE_1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if (tid < cblock_hist_size) if ((tid < cblock_hist_size) && (gid < blocks_total))
{
__global float* block_hist = block_hists +
(gidY * img_block_width + gidX) * cblock_hist_size;
block_hist[tid] = final_hist[tid]; block_hist[tid] = final_hist[tid];
}
} }
//------------------------------------------------------------- //-------------------------------------------------------------
@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size)
unsigned int tid = get_local_id(0); unsigned int tid = get_local_id(0);
float sum = smem[tid]; float sum = smem[tid];
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 512)
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); } {
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (size >= 256)
{
if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (size >= 128)
{
if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid < 32) if (tid < 32)
{ {
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) if (tid < 16)
{ {
#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
} }
@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
if (tid < 64) products[tid] = product = product + products[tid + 64]; if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
if (tid < 32) if (tid < 32)
{ {
volatile __local float* smem = products;
smem[tid] = product = product + smem[tid + 32]; smem[tid] = product = product + smem[tid + 32];
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) if (tid < 16)
{ {
volatile __local float* smem = products; #endif
smem[tid] = product = product + smem[tid + 16]; smem[tid] = product = product + smem[tid + 16];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
smem[tid] = product = product + smem[tid + 8]; smem[tid] = product = product + smem[tid + 8];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
smem[tid] = product = product + smem[tid + 4]; smem[tid] = product = product + smem[tid + 4];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
smem[tid] = product = product + smem[tid + 2]; smem[tid] = product = product + smem[tid + 2];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
smem[tid] = product = product + smem[tid + 1]; smem[tid] = product = product + smem[tid + 1];
} }
@ -248,8 +331,8 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
// Extract descriptors // Extract descriptors
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors) __global const float* block_hists, __global float* descriptors)
{ {
int tid = get_local_id(0); int tid = get_local_id(0);
int gidX = get_group_id(0); int gidX = get_group_id(0);
@ -271,8 +354,8 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
} }
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
{ {
int tid = get_local_id(0); int tid = get_local_id(0);
int gidX = get_group_id(0); int gidX = get_group_id(0);
@ -301,8 +384,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
// Gradients computation // Gradients computation
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle, const __global uchar4 * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins) const float angle_scale, const char correct_gamma, const int cnbins)
{ {
const int x = get_global_id(0); const int x = get_global_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
@ -400,8 +483,8 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
} }
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global uchar * qangle, __global const uchar * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins) const float angle_scale, const char correct_gamma, const int cnbins)
{ {
const int x = get_global_id(0); const int x = get_global_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);

View File

@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2
} }
#define BUFFER 64 #define BUFFER 64
#ifdef CPU
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = val1 += smem1[tid + 32];
smem2[tid] = val2 += smem2[tid + 32];
smem3[tid] = val3 += smem3[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = val1 += smem1[tid + 16];
smem2[tid] = val2 += smem2[tid + 16];
smem3[tid] = val3 += smem3[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = val1 += smem1[tid + 8];
smem2[tid] = val2 += smem2[tid + 8];
smem3[tid] = val3 += smem3[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = val1 += smem1[tid + 4];
smem2[tid] = val2 += smem2[tid + 4];
smem3[tid] = val3 += smem3[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = val1 += smem1[tid + 2];
smem2[tid] = val2 += smem2[tid + 2];
smem3[tid] = val3 += smem3[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = val1 += smem1[tid + 1];
smem2[BUFFER] = val2 += smem2[tid + 1];
smem3[BUFFER] = val3 += smem3[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
smem2[tid] = (val2 += smem2[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
smem2[tid] = (val2 += smem2[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
smem2[tid] = (val2 += smem2[tid + 32]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
smem2[tid] = (val2 += smem2[tid + 16]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
smem2[tid] = (val2 += smem2[tid + 8]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
smem2[tid] = (val2 += smem2[tid + 4]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
smem2[tid] = (val2 += smem2[tid + 2]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
smem2[BUFFER] = (val2 += smem2[tid + 1]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
void reduce1(float val1, volatile __local float* smem1, int tid)
{
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{ {
smem1[tid] = val1; smem1[tid] = val1;
@ -325,6 +528,7 @@ void reduce1(float val1, __local float* smem1, int tid)
vmem1[tid] = val1 += vmem1[tid + 1]; vmem1[tid] = val1 += vmem1[tid + 1];
} }
} }
#endif
#define SCALE (1.0f / (1 << 20)) #define SCALE (1.0f / (1 << 20))
#define THRESHOLD 0.01f #define THRESHOLD 0.01f
@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
} }
#define GRIDSIZE 3
__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
{ {
#ifdef CPU
__local float smem1[BUFFER+1];
__local float smem2[BUFFER+1];
__local float smem3[BUFFER+1];
#else
__local float smem1[BUFFER]; __local float smem1[BUFFER];
__local float smem2[BUFFER]; __local float smem2[BUFFER];
__local float smem3[BUFFER]; __local float smem3[BUFFER];
#endif
unsigned int xid=get_local_id(0); unsigned int xid=get_local_id(0);
unsigned int yid=get_local_id(1); unsigned int yid=get_local_id(1);
@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
const int tid = mad24(yid, xsize, xid); const int tid = mad24(yid, xsize, xid);
float2 prevPt = prevPts[gid] / (1 << level); float2 prevPt = prevPts[gid] / (float2)(1 << level);
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{ {
@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float A12 = 0; float A12 = 0;
float A22 = 0; float A22 = 0;
float I_patch[3][3]; float I_patch[GRIDSIZE][GRIDSIZE];
float dIdx_patch[3][3]; float dIdx_patch[GRIDSIZE][GRIDSIZE];
float dIdy_patch[3][3]; float dIdy_patch[GRIDSIZE][GRIDSIZE];
yBase=yid; yBase=yid;
{ {
@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
&A11, &A12, &A22); &A11, &A12, &A22);
} }
reduce3(A11, A12, A22, smem1, smem2, smem3, tid); reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
A11 = smem1[BUFFER];
A12 = smem2[BUFFER];
A22 = smem3[BUFFER];
#else
A11 = smem1[0]; A11 = smem1[0];
A12 = smem2[0]; A12 = smem2[0];
A22 = smem3[0]; A22 = smem3[0];
#endif
float D = A11 * A22 - A12 * A12; float D = A11 * A22 - A12 * A12;
@ -609,8 +826,13 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
reduce2(b1, b2, smem1, smem2, tid); reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
b1 = smem1[BUFFER];
b2 = smem2[BUFFER];
#else
b1 = smem1[0]; b1 = smem1[0];
b2 = smem2[0]; b2 = smem2[0];
#endif
float2 delta; float2 delta;
delta.x = A12 * b2 - A22 * b1; delta.x = A12 * b2 - A22 * b1;
@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
nextPts[gid] = prevPt; nextPts[gid] = prevPt;
if (calcErr) if (calcErr)
err[gid] = smem1[0] / (c_winSize_x * c_winSize_y); #ifdef CPU
err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
#else
err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
#endif
} }
} }
__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
{ {
__local float smem1[BUFFER]; #ifdef CPU
__local float smem2[BUFFER]; __local float smem1[BUFFER+1];
__local float smem3[BUFFER]; __local float smem2[BUFFER+1];
__local float smem3[BUFFER+1];
#else
__local float smem1[BUFFER];
__local float smem2[BUFFER];
__local float smem3[BUFFER];
#endif
unsigned int xid=get_local_id(0); unsigned int xid=get_local_id(0);
unsigned int yid=get_local_id(1); unsigned int yid=get_local_id(1);
@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
const int tid = mad24(yid, xsize, xid); const int tid = mad24(yid, xsize, xid);
float2 nextPt = prevPts[gid]/(1<<level); float2 nextPt = prevPts[gid]/(float2)(1<<level);
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{ {
@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
// extract the patch from the first image, compute covariation matrix of derivatives // extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0; float A11 = 0.0f;
float A12 = 0; float A12 = 0.0f;
float A22 = 0; float A22 = 0.0f;
float4 I_patch[8]; float4 I_patch[8];
float4 dIdx_patch[8]; float4 dIdx_patch[8];
@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
reduce3(A11, A12, A22, smem1, smem2, smem3, tid); reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
A11 = smem1[BUFFER];
A12 = smem2[BUFFER];
A22 = smem3[BUFFER];
#else
A11 = smem1[0]; A11 = smem1[0];
A12 = smem2[0]; A12 = smem2[0];
A22 = smem3[0]; A22 = smem3[0];
#endif
float D = A11 * A22 - A12 * A12; float D = A11 * A22 - A12 * A12;
@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
&b1, &b2); &b1, &b2);
} }
reduce2(b1, b2, smem1, smem2, tid); reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
b1 = smem1[BUFFER];
b2 = smem2[BUFFER];
#else
b1 = smem1[0]; b1 = smem1[0];
b2 = smem2[0]; b2 = smem2[0];
#endif
float2 delta; float2 delta;
delta.x = A12 * b2 - A22 * b1; delta.x = A12 * b2 - A22 * b1;
@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
nextPts[gid] = nextPt; nextPts[gid] = nextPt;
if (calcErr) if (calcErr)
err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y); #ifdef CPU
err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
#else
err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
#endif
} }
} }

View File

@ -226,9 +226,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
// int Y = get_group_id(1) * ROWSperTHREAD + radius; // int Y = get_group_id(1) * ROWSperTHREAD + radius;
#define Y (get_group_id(1) * ROWSperTHREAD + radius) #define Y (get_group_id(1) * ROWSperTHREAD + radius)
volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned char* disparImage = disp + X + Y * disp_step; __global unsigned char* disparImage = disp + X + Y * disp_step;
@ -251,9 +251,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (X < cwidth - radius && Y < cheight - radius) if (X < cwidth - radius && Y < cheight - radius)
{ {
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (minSSD.x < minSSDImage[0]) if (minSSD.x < minSSDImage[0])
{ {
disparImage[0] = (unsigned char)(d + minSSD.y); disparImage[0] = (unsigned char)(d + minSSD.y);
@ -264,7 +264,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
for(int row = 1; row < end_row; row++) for(int row = 1; row < end_row; row++)
{ {
int idx1 = y_tex * img_step + x_tex; int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex; int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex;
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -278,10 +278,10 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (X < cwidth - radius && row < cheight - radius - Y) if (X < cwidth - radius && row < cheight - radius - Y)
{ {
int idx = row * cminSSD_step; int idx = row * cminSSD_step;
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (minSSD.x < minSSDImage[idx]) if (minSSD.x < minSSDImage[idx])
{ {
disparImage[disp_step * row] = (unsigned char)(d + minSSD.y); disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
@ -378,50 +378,50 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in
int beg_row = group_id_y * RpT; int beg_row = group_id_y * RpT;
int end_row = min(beg_row + RpT, disp_rows); int end_row = min(beg_row + RpT, disp_rows);
// if (x < disp_cols) // if (x < disp_cols)
// { // {
int y = beg_row; int y = beg_row;
float sum = 0; float sum = 0;
float sum_extra = 0; float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i) for(int i = y - winsz2; i <= y + winsz2; ++i)
{ {
sum += sobel(input, x - winsz2, i, input_rows, input_cols); sum += sobel(input, x - winsz2, i, input_rows, input_cols);
if (cols_extra)
sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols);
}
*cols = sum;
if (cols_extra) if (cols_extra)
sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
*cols_extra = sum_extra; *cols_extra = sum_extra;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold) if (sum_win < threshold)
disp[y * disp_step + x] = 0; disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
}
for(int y = beg_row + 1; y < end_row; ++y) // }
{
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
*cols_extra = sum_extra;
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
}
// }
} }

View File

@ -115,10 +115,9 @@ int main(int argc, char **argv)
std::cout << "platform invalid\n"; std::cout << "platform invalid\n";
return -1; return -1;
} }
if(pid != 0 || device != 0)
{ setDevice(oclinfo[pid], device);
setDevice(oclinfo[pid], device);
}
cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }

View File

@ -1531,6 +1531,10 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(false))); Values(false)));
INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(false)));
INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine( INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(false))); // Values(false) is the reserved parameter Values(false))); // Values(false) is the reserved parameter
@ -1586,19 +1590,19 @@ INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32F
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine( INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter //Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine( INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter //Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine( INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter //Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine( INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter //Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false))); INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));

View File

@ -43,16 +43,14 @@
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
namespace namespace
{ {
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
// BruteForceMatcher // BruteForceMatcher
CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist,\
CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist, cv::ocl::BruteForceMatcher_OCL_base::L2Dist, cv::ocl::BruteForceMatcher_OCL_base::HammingDist) cv::ocl::BruteForceMatcher_OCL_base::L2Dist,\
cv::ocl::BruteForceMatcher_OCL_base::HammingDist)
IMPLEMENT_PARAM_CLASS(DescriptorSize, int) IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
PARAM_TEST_CASE(BruteForceMatcher, DistType, DescriptorSize)
PARAM_TEST_CASE(BruteForceMatcher/*, NormCode*/, DistType, DescriptorSize)
{ {
//std::vector<cv::ocl::Info> oclinfo;
cv::ocl::BruteForceMatcher_OCL_base::DistType distType; cv::ocl::BruteForceMatcher_OCL_base::DistType distType;
int normCode; int normCode;
int dim; int dim;
@ -64,13 +62,9 @@ namespace
virtual void SetUp() virtual void SetUp()
{ {
//normCode = GET_PARAM(0);
distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0); distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0);
dim = GET_PARAM(1); dim = GET_PARAM(1);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
queryDescCount = 300; // must be even number because we split train data in some cases in two queryDescCount = 300; // must be even number because we split train data in some cases in two
countFactor = 4; // do not change it countFactor = 4; // do not change it
@ -172,49 +166,33 @@ namespace
cv::ocl::BruteForceMatcher_OCL_base matcher(distType); cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
// assume support atomic. std::vector< std::vector<cv::DMatch> > matches;
//if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS)) matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);
//{
// try ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
// {
// std::vector< std::vector<cv::DMatch> > matches; int badCount = 0;
// matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius); for (size_t i = 0; i < matches.size(); i++)
// }
// catch (const cv::Exception& e)
// {
// ASSERT_EQ(CV_StsNotImplemented, e.code);
// }
//}
//else
{ {
std::vector< std::vector<cv::DMatch> > matches; if ((int)matches[i].size() != 1)
matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
int badCount = 0;
for (size_t i = 0; i < matches.size(); i++)
{ {
if ((int)matches[i].size() != 1) badCount++;
{ }
badCount++; else
} {
else cv::DMatch match = matches[i][0];
{ if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
cv::DMatch match = matches[i][0]; badCount++;
if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
badCount++;
}
} }
ASSERT_EQ(0, badCount);
} }
ASSERT_EQ(0, badCount);
} }
INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine( INSTANTIATE_TEST_CASE_P(OCL_Features2D, BruteForceMatcher,
//ALL_DEVICES, testing::Combine(
testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)), testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)),
testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)))); testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304))));
} // namespace } // namespace
#endif #endif

2
modules/python/src2/cv.py Executable file → Normal file
View File

@ -1,3 +1 @@
#/usr/bin/env python
from cv2.cv import * from cv2.cv import *

View File

@ -396,7 +396,7 @@ static PyObject* pyopencv_from(const Mat& m)
if(!p->refcount || p->allocator != &g_numpyAllocator) if(!p->refcount || p->allocator != &g_numpyAllocator)
{ {
temp.allocator = &g_numpyAllocator; temp.allocator = &g_numpyAllocator;
m.copyTo(temp); ERRWRAP2(m.copyTo(temp));
p = &temp; p = &temp;
} }
p->addref(); p->addref();

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys import sys
from string import Template from string import Template

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import hdr_parser, sys, re, os, cStringIO import hdr_parser, sys, re, os, cStringIO
from string import Template from string import Template

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import os, sys, re, string import os, sys, re, string

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
# Calculating and displaying 2D Hue-Saturation histogram of a color image # Calculating and displaying 2D Hue-Saturation histogram of a color image
import sys import sys

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys import sys
import math import math

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys import sys
import cv2.cv as cv import cv2.cv as cv

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv
import unittest import unittest

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv
import numpy as np import numpy as np

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv
import numpy as np import numpy as np

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv
import math import math

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv
import math import math

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2.cv as cv import cv2.cv as cv

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import unittest import unittest
import random import random

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import unittest import unittest
import random import random

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import urllib import urllib
import cv2.cv as cv import cv2.cv as cv

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import unittest import unittest
import random import random

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# transformations.py # transformations.py

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import sys, re, os.path import sys, re, os.path
from xml.dom.minidom import parse from xml.dom.minidom import parse

View File

@ -1,11 +0,0 @@
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools"
android:layout_width="match_parent"
android:layout_height="match_parent" >
<org.opencv.android.JavaCameraView
android:layout_width="fill_parent"
android:layout_height="fill_parent"
android:id="@+id/puzzle_activity_surface_view" />
</LinearLayout>

View File

@ -1,6 +0,0 @@
<menu xmlns:android="http://schemas.android.com/apk/res/android">
<item android:id="@+id/menu_start_new_game"
android:title="@string/menu_start_new_game"
android:orderInCategory="100" />
<item android:id="@+id/menu_toggle_tile_numbers" android:title="@string/menu_toggle_tile_numbers"></item>
</menu>

View File

@ -6,6 +6,7 @@ import org.opencv.android.OpenCVLoader;
import org.opencv.core.Mat; import org.opencv.core.Mat;
import org.opencv.android.CameraBridgeViewBase; import org.opencv.android.CameraBridgeViewBase;
import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener; import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener;
import org.opencv.android.JavaCameraView;
import android.os.Bundle; import android.os.Bundle;
import android.app.Activity; import android.app.Activity;
@ -22,6 +23,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
private CameraBridgeViewBase mOpenCvCameraView; private CameraBridgeViewBase mOpenCvCameraView;
private Puzzle15Processor mPuzzle15; private Puzzle15Processor mPuzzle15;
private MenuItem mItemHideNumbers;
private MenuItem mItemStartNewGame;
private int mGameWidth; private int mGameWidth;
private int mGameHeight; private int mGameHeight;
@ -52,9 +56,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
super.onCreate(savedInstanceState); super.onCreate(savedInstanceState);
getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON); getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
setContentView(R.layout.activity_puzzle15); Log.d(TAG, "Creating and seting view");
mOpenCvCameraView = (CameraBridgeViewBase) new JavaCameraView(this, -1);
mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.puzzle_activity_surface_view); setContentView(mOpenCvCameraView);
mOpenCvCameraView.setCvCameraViewListener(this); mOpenCvCameraView.setCvCameraViewListener(this);
mPuzzle15 = new Puzzle15Processor(); mPuzzle15 = new Puzzle15Processor();
mPuzzle15.prepareNewGame(); mPuzzle15.prepareNewGame();
@ -83,17 +87,19 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
@Override @Override
public boolean onCreateOptionsMenu(Menu menu) { public boolean onCreateOptionsMenu(Menu menu) {
getMenuInflater().inflate(R.menu.activity_puzzle15, menu); Log.i(TAG, "called onCreateOptionsMenu");
mItemHideNumbers = menu.add("Show/hide tile numbers");
mItemStartNewGame = menu.add("Start new game");
return true; return true;
} }
@Override @Override
public boolean onOptionsItemSelected(MenuItem item) { public boolean onOptionsItemSelected(MenuItem item) {
Log.i(TAG, "Menu Item selected " + item); Log.i(TAG, "Menu Item selected " + item);
if (item.getItemId() == R.id.menu_start_new_game) { if (item == mItemStartNewGame) {
/* We need to start new game */ /* We need to start new game */
mPuzzle15.prepareNewGame(); mPuzzle15.prepareNewGame();
} else if (item.getItemId() == R.id.menu_toggle_tile_numbers) { } else if (item == mItemHideNumbers) {
/* We need to enable or disable drawing of the tile numbers */ /* We need to enable or disable drawing of the tile numbers */
mPuzzle15.toggleTileNumbers(); mPuzzle15.toggleTileNumbers();
} }

View File

@ -6,7 +6,7 @@ include ../../sdk/native/jni/OpenCV.mk
LOCAL_MODULE := native_activity LOCAL_MODULE := native_activity
LOCAL_SRC_FILES := native.cpp LOCAL_SRC_FILES := native.cpp
LOCAL_LDLIBS := -lm -llog -landroid LOCAL_LDLIBS += -lm -llog -landroid
LOCAL_STATIC_LIBRARIES := android_native_app_glue LOCAL_STATIC_LIBRARIES := android_native_app_glue
include $(BUILD_SHARED_LIBRARY) include $(BUILD_SHARED_LIBRARY)

View File

@ -1,2 +1,4 @@
APP_ABI := armeabi-v7a APP_ABI := armeabi-v7a
APP_STL := gnustl_static
APP_CPPFLAGS := -frtti -fexceptions
APP_PLATFORM := android-9 APP_PLATFORM := android-9

View File

@ -73,7 +73,7 @@ int main( int argc, char** argv ) {
} }
Mat imgB = imread(argv[2], IMREAD_GRAYSCALE ); Mat imgB = imread(argv[2], IMREAD_GRAYSCALE );
if( !imgA.data ) { if( !imgB.data ) {
std::cout << " --(!) Error reading image " << argv[2] << std::endl; std::cout << " --(!) Error reading image " << argv[2] << std::endl;
return -1; return -1;
} }

View File

@ -12,9 +12,8 @@ static void help()
{ {
// print a welcome message, and the OpenCV version // print a welcome message, and the OpenCV version
cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n" cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n"
"Using OpenCV version %s\n" << CV_VERSION << "\n" "Using OpenCV version " << CV_VERSION << endl;
<< endl; cout << "\nIt uses camera by default, but you can provide a path to video as an argument.\n";
cout << "\nHot keys: \n" cout << "\nHot keys: \n"
"\tESC - quit the program\n" "\tESC - quit the program\n"
"\tr - auto-initialize tracking\n" "\tr - auto-initialize tracking\n"
@ -30,13 +29,15 @@ static void onMouse( int event, int x, int y, int /*flags*/, void* /*param*/ )
{ {
if( event == EVENT_LBUTTONDOWN ) if( event == EVENT_LBUTTONDOWN )
{ {
point = Point2f((float)x,(float)y); point = Point2f((float)x, (float)y);
addRemovePt = true; addRemovePt = true;
} }
} }
int main( int argc, char** argv ) int main( int argc, char** argv )
{ {
help();
VideoCapture cap; VideoCapture cap;
TermCriteria termcrit(TermCriteria::COUNT|TermCriteria::EPS,20,0.03); TermCriteria termcrit(TermCriteria::COUNT|TermCriteria::EPS,20,0.03);
Size subPixWinSize(10,10), winSize(31,31); Size subPixWinSize(10,10), winSize(31,31);
@ -56,8 +57,6 @@ int main( int argc, char** argv )
return 0; return 0;
} }
help();
namedWindow( "LK Demo", 1 ); namedWindow( "LK Demo", 1 );
setMouseCallback( "LK Demo", onMouse, 0 ); setMouseCallback( "LK Demo", onMouse, 0 );
@ -134,17 +133,16 @@ int main( int argc, char** argv )
needToInit = true; needToInit = true;
break; break;
case 'c': case 'c':
points[0].clear();
points[1].clear(); points[1].clear();
break; break;
case 'n': case 'n':
nightMode = !nightMode; nightMode = !nightMode;
break; break;
default:
;
} }
std::swap(points[1], points[0]); std::swap(points[1], points[0]);
swap(prevGray, gray); cv::swap(prevGray, gray);
} }
return 0; return 0;

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
''' '''
This module contains some common routines used by other samples. This module contains some common routines used by other samples.
''' '''

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
import cv2 import cv2
import numpy as np import numpy as np

View File

@ -1,4 +1,4 @@
#/usr/bin/env python #!/usr/bin/env python
''' '''
Watershed segmentation Watershed segmentation