mirror of
https://github.com/opencv/opencv.git
synced 2025-01-05 18:05:31 +08:00
Merge branch '2.4'
This commit is contained in:
commit
3b364330ad
@ -4,23 +4,43 @@ import android.os.IBinder;
|
|||||||
|
|
||||||
public class BinderConnector
|
public class BinderConnector
|
||||||
{
|
{
|
||||||
public BinderConnector(MarketConnector Market)
|
public BinderConnector(MarketConnector Market) {
|
||||||
{
|
mMarket = Market;
|
||||||
Init(Market);
|
|
||||||
}
|
|
||||||
public native IBinder Connect();
|
|
||||||
public boolean Disconnect()
|
|
||||||
{
|
|
||||||
Final();
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
public boolean Init() {
|
||||||
|
boolean result = false;
|
||||||
|
if (mIsReady)
|
||||||
|
result = Init(mMarket);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public native IBinder Connect();
|
||||||
|
|
||||||
|
public boolean Disconnect()
|
||||||
{
|
{
|
||||||
System.loadLibrary("OpenCVEngine");
|
if (mIsReady)
|
||||||
System.loadLibrary("OpenCVEngine_jni");
|
Final();
|
||||||
|
|
||||||
|
return mIsReady;
|
||||||
}
|
}
|
||||||
|
|
||||||
private native boolean Init(MarketConnector Market);
|
private native boolean Init(MarketConnector Market);
|
||||||
public native void Final();
|
private native void Final();
|
||||||
|
private static boolean mIsReady = false;
|
||||||
|
private MarketConnector mMarket;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
System.loadLibrary("OpenCVEngine");
|
||||||
|
System.loadLibrary("OpenCVEngine_jni");
|
||||||
|
mIsReady = true;
|
||||||
|
}
|
||||||
|
catch(UnsatisfiedLinkError e) {
|
||||||
|
mIsReady = false;
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -47,9 +47,17 @@ public class HardwareDetector
|
|||||||
|
|
||||||
public static native int DetectKnownPlatforms();
|
public static native int DetectKnownPlatforms();
|
||||||
|
|
||||||
static
|
public static boolean mIsReady = false;
|
||||||
{
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
System.loadLibrary("OpenCVEngine");
|
System.loadLibrary("OpenCVEngine");
|
||||||
System.loadLibrary("OpenCVEngine_jni");
|
System.loadLibrary("OpenCVEngine_jni");
|
||||||
|
mIsReady = true;
|
||||||
|
}
|
||||||
|
catch(UnsatisfiedLinkError e) {
|
||||||
|
mIsReady = false;
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,31 +3,62 @@ package org.opencv.engine;
|
|||||||
import android.app.Service;
|
import android.app.Service;
|
||||||
import android.content.Intent;
|
import android.content.Intent;
|
||||||
import android.os.IBinder;
|
import android.os.IBinder;
|
||||||
|
import android.os.RemoteException;
|
||||||
import android.util.Log;
|
import android.util.Log;
|
||||||
|
|
||||||
|
|
||||||
public class OpenCVEngineService extends Service
|
public class OpenCVEngineService extends Service
|
||||||
{
|
{
|
||||||
private static final String TAG = "OpenCVEngine/Service";
|
private static final String TAG = "OpenCVEngine/Service";
|
||||||
private IBinder mEngineInterface;
|
private IBinder mEngineInterface = null;
|
||||||
private MarketConnector mMarket;
|
private MarketConnector mMarket;
|
||||||
private BinderConnector mNativeBinder;
|
private BinderConnector mNativeBinder;
|
||||||
public void onCreate()
|
|
||||||
{
|
public void onCreate() {
|
||||||
Log.i(TAG, "Service starting");
|
Log.i(TAG, "Service starting");
|
||||||
super.onCreate();
|
super.onCreate();
|
||||||
Log.i(TAG, "Engine binder component creating");
|
Log.i(TAG, "Engine binder component creating");
|
||||||
mMarket = new MarketConnector(getBaseContext());
|
mMarket = new MarketConnector(getBaseContext());
|
||||||
mNativeBinder = new BinderConnector(mMarket);
|
mNativeBinder = new BinderConnector(mMarket);
|
||||||
|
if (mNativeBinder.Init()) {
|
||||||
mEngineInterface = mNativeBinder.Connect();
|
mEngineInterface = mNativeBinder.Connect();
|
||||||
Log.i(TAG, "Service started successfully");
|
Log.i(TAG, "Service started successfully");
|
||||||
|
} else {
|
||||||
|
Log.e(TAG, "Cannot initialize native part of OpenCV Manager!");
|
||||||
|
Log.e(TAG, "Using stub instead");
|
||||||
|
|
||||||
|
mEngineInterface = new OpenCVEngineInterface.Stub() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean installVersion(String version) throws RemoteException {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IBinder onBind(Intent intent)
|
@Override
|
||||||
{
|
public String getLibraryList(String version) throws RemoteException {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getLibPathByVersion(String version) throws RemoteException {
|
||||||
|
// TODO Auto-generated method stub
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getEngineVersion() throws RemoteException {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public IBinder onBind(Intent intent) {
|
||||||
Log.i(TAG, "Service onBind called for intent " + intent.toString());
|
Log.i(TAG, "Service onBind called for intent " + intent.toString());
|
||||||
return mEngineInterface;
|
return mEngineInterface;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean onUnbind(Intent intent)
|
public boolean onUnbind(Intent intent)
|
||||||
{
|
{
|
||||||
Log.i(TAG, "Service onUnbind called for intent " + intent.toString());
|
Log.i(TAG, "Service onUnbind called for intent " + intent.toString());
|
||||||
|
@ -42,6 +42,26 @@ public class ManagerActivity extends Activity
|
|||||||
@Override
|
@Override
|
||||||
public void onCreate(Bundle savedInstanceState) {
|
public void onCreate(Bundle savedInstanceState) {
|
||||||
super.onCreate(savedInstanceState);
|
super.onCreate(savedInstanceState);
|
||||||
|
|
||||||
|
if (!HardwareDetector.mIsReady) {
|
||||||
|
Log.e(TAG, "Cannot initialize native part of OpenCV Manager!");
|
||||||
|
|
||||||
|
AlertDialog dialog = new AlertDialog.Builder(this).create();
|
||||||
|
|
||||||
|
dialog.setTitle("OpenCV Manager Error");
|
||||||
|
dialog.setMessage("OpenCV Manager is incompatible with this device. Please replace it with an appropriate package.");
|
||||||
|
dialog.setCancelable(false);
|
||||||
|
dialog.setButton("OK", new DialogInterface.OnClickListener() {
|
||||||
|
|
||||||
|
public void onClick(DialogInterface dialog, int which) {
|
||||||
|
finish();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
dialog.show();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
setContentView(R.layout.main);
|
setContentView(R.layout.main);
|
||||||
|
|
||||||
TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue);
|
TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue);
|
||||||
@ -186,6 +206,20 @@ public class ManagerActivity extends Activity
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
mPackageChangeReciever = new BroadcastReceiver() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onReceive(Context context, Intent intent) {
|
||||||
|
Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever");
|
||||||
|
Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message");
|
||||||
|
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
|
||||||
|
{
|
||||||
|
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
|
||||||
|
EngineVersionView.setText("not avaliable");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
IntentFilter filter = new IntentFilter();
|
IntentFilter filter = new IntentFilter();
|
||||||
filter.addAction(Intent.ACTION_PACKAGE_ADDED);
|
filter.addAction(Intent.ACTION_PACKAGE_ADDED);
|
||||||
filter.addAction(Intent.ACTION_PACKAGE_CHANGED);
|
filter.addAction(Intent.ACTION_PACKAGE_CHANGED);
|
||||||
@ -199,17 +233,23 @@ public class ManagerActivity extends Activity
|
|||||||
@Override
|
@Override
|
||||||
protected void onDestroy() {
|
protected void onDestroy() {
|
||||||
super.onDestroy();
|
super.onDestroy();
|
||||||
|
if (mPackageChangeReciever != null)
|
||||||
unregisterReceiver(mPackageChangeReciever);
|
unregisterReceiver(mPackageChangeReciever);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void onResume() {
|
protected void onResume() {
|
||||||
super.onResume();
|
super.onResume();
|
||||||
|
if (HardwareDetector.mIsReady) {
|
||||||
Log.d(TAG, "Filling package list on resume");
|
Log.d(TAG, "Filling package list on resume");
|
||||||
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
|
OpenCVEngineServiceConnection connection = new OpenCVEngineServiceConnection();
|
||||||
{
|
if (!bindService(new Intent("org.opencv.engine.BIND"), connection, Context.BIND_AUTO_CREATE)) {
|
||||||
|
Log.e(TAG, "Cannot bind to OpenCV Manager service!");
|
||||||
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
|
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
|
||||||
|
if (EngineVersionView != null)
|
||||||
EngineVersionView.setText("not avaliable");
|
EngineVersionView.setText("not avaliable");
|
||||||
|
unbindService(connection);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -225,19 +265,7 @@ public class ManagerActivity extends Activity
|
|||||||
protected int ManagerApiLevel = 0;
|
protected int ManagerApiLevel = 0;
|
||||||
protected String ManagerVersion;
|
protected String ManagerVersion;
|
||||||
|
|
||||||
protected BroadcastReceiver mPackageChangeReciever = new BroadcastReceiver() {
|
protected BroadcastReceiver mPackageChangeReciever = null;
|
||||||
|
|
||||||
@Override
|
|
||||||
public void onReceive(Context context, Intent intent) {
|
|
||||||
Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever");
|
|
||||||
Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message");
|
|
||||||
if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
|
|
||||||
{
|
|
||||||
TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
|
|
||||||
EngineVersionView.setText("not avaliable");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
protected class OpenCVEngineServiceConnection implements ServiceConnection
|
protected class OpenCVEngineServiceConnection implements ServiceConnection
|
||||||
{
|
{
|
||||||
@ -246,6 +274,12 @@ public class ManagerActivity extends Activity
|
|||||||
|
|
||||||
public void onServiceConnected(ComponentName name, IBinder service) {
|
public void onServiceConnected(ComponentName name, IBinder service) {
|
||||||
OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service);
|
OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service);
|
||||||
|
if (EngineService == null) {
|
||||||
|
Log.e(TAG, "Cannot connect to OpenCV Manager Service!");
|
||||||
|
unbindService(this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ManagerApiLevel = EngineService.getEngineVersion();
|
ManagerApiLevel = EngineService.getEngineVersion();
|
||||||
} catch (RemoteException e) {
|
} catch (RemoteException e) {
|
||||||
|
@ -99,7 +99,11 @@ elseif(CMAKE_COMPILER_IS_GNUCXX)
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(MINGW64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64")
|
if(MSVC64 OR MINGW64)
|
||||||
|
set(X86_64 1)
|
||||||
|
elseif(MSVC AND NOT CMAKE_CROSSCOMPILING)
|
||||||
|
set(X86 1)
|
||||||
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||||
set(X86_64 1)
|
set(X86_64 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||||
set(X86 1)
|
set(X86 1)
|
||||||
|
@ -4,7 +4,7 @@ if(APPLE)
|
|||||||
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
|
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
|
||||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
|
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
|
||||||
else(APPLE)
|
else(APPLE)
|
||||||
find_package(OpenCL QUIET)
|
#find_package(OpenCL QUIET)
|
||||||
|
|
||||||
if (NOT OPENCL_FOUND)
|
if (NOT OPENCL_FOUND)
|
||||||
find_path(OPENCL_ROOT_DIR
|
find_path(OPENCL_ROOT_DIR
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys, glob
|
import sys, glob
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, fnmatch, re
|
import os, sys, fnmatch, re
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
#
|
#
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
ocv domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python.
|
ocv domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python.
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
"""gen_pattern.py
|
"""gen_pattern.py
|
||||||
To run:
|
To run:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# svgfig.py copyright (C) 2008 Jim Pivarski <jpivarski@gmail.com>
|
# svgfig.py copyright (C) 2008 Jim Pivarski <jpivarski@gmail.com>
|
||||||
#
|
#
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, re
|
import os, sys, re
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os.path
|
import os.path
|
||||||
|
@ -455,7 +455,7 @@ protected:
|
|||||||
TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
|
TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); }
|
||||||
*/
|
*/
|
||||||
|
|
||||||
TEST(Core_globbing, accurasy)
|
TEST(Core_globbing, accuracy)
|
||||||
{
|
{
|
||||||
std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*";
|
std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*";
|
||||||
std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png";
|
std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png";
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys, re
|
import sys, re
|
||||||
|
|
||||||
|
@ -154,6 +154,11 @@ the symptoms were damaged image and 'Corrupt JPEG data: premature end of data se
|
|||||||
- USE_TEMP_BUFFER fixes the main problem (improper buffer management) and
|
- USE_TEMP_BUFFER fixes the main problem (improper buffer management) and
|
||||||
prevents bad images in the first place
|
prevents bad images in the first place
|
||||||
|
|
||||||
|
11th patch: April 2, 2013, Forrest Reiling forrest.reiling@gmail.com
|
||||||
|
Added v4l2 support for getting capture property CV_CAP_PROP_POS_MSEC.
|
||||||
|
Returns the millisecond timestamp of the last frame grabbed or 0 if no frames have been grabbed
|
||||||
|
Used to successfully synchonize 2 Logitech C310 USB webcams to within 16 ms of one another
|
||||||
|
|
||||||
|
|
||||||
make & enjoy!
|
make & enjoy!
|
||||||
|
|
||||||
@ -320,6 +325,8 @@ typedef struct CvCaptureCAM_V4L
|
|||||||
struct v4l2_queryctrl queryctrl;
|
struct v4l2_queryctrl queryctrl;
|
||||||
struct v4l2_querymenu querymenu;
|
struct v4l2_querymenu querymenu;
|
||||||
|
|
||||||
|
struct timeval timestamp;
|
||||||
|
|
||||||
/* V4L2 control variables */
|
/* V4L2 control variables */
|
||||||
int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max;
|
int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max;
|
||||||
int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max;
|
int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max;
|
||||||
@ -836,6 +843,9 @@ static int _capture_V4L2 (CvCaptureCAM_V4L *capture, char *deviceName)
|
|||||||
capture->v4l2_gain_max = 0;
|
capture->v4l2_gain_max = 0;
|
||||||
capture->v4l2_exposure_max = 0;
|
capture->v4l2_exposure_max = 0;
|
||||||
|
|
||||||
|
capture->timestamp.tv_sec = 0;
|
||||||
|
capture->timestamp.tv_usec = 0;
|
||||||
|
|
||||||
/* Scan V4L2 controls */
|
/* Scan V4L2 controls */
|
||||||
v4l2_scan_controls(capture);
|
v4l2_scan_controls(capture);
|
||||||
|
|
||||||
@ -1221,6 +1231,9 @@ static int read_frame_v4l2(CvCaptureCAM_V4L* capture) {
|
|||||||
if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf))
|
if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf))
|
||||||
perror ("VIDIOC_QBUF");
|
perror ("VIDIOC_QBUF");
|
||||||
|
|
||||||
|
//set timestamp in capture struct to be timestamp of most recent frame
|
||||||
|
capture->timestamp = buf.timestamp;
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2308,6 +2321,13 @@ static double icvGetPropertyCAM_V4L (CvCaptureCAM_V4L* capture,
|
|||||||
/* initialize the control structure */
|
/* initialize the control structure */
|
||||||
|
|
||||||
switch (property_id) {
|
switch (property_id) {
|
||||||
|
case CV_CAP_PROP_POS_MSEC:
|
||||||
|
if (capture->FirstCapture) {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
return 1000 * capture->timestamp.tv_sec + ((double) capture->timestamp.tv_usec) / 1000;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case CV_CAP_PROP_BRIGHTNESS:
|
case CV_CAP_PROP_BRIGHTNESS:
|
||||||
capture->control.id = V4L2_CID_BRIGHTNESS;
|
capture->control.id = V4L2_CID_BRIGHTNESS;
|
||||||
break;
|
break;
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys, os, re
|
import sys, os, re
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys, re, os.path
|
import sys, re, os.path
|
||||||
from string import Template
|
from string import Template
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, re, string, glob
|
import os, sys, re, string, glob
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, re, string, fnmatch
|
import os, sys, re, string, fnmatch
|
||||||
allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"]
|
allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"]
|
||||||
|
@ -128,7 +128,8 @@ namespace cv
|
|||||||
enum DEVICE_INFO
|
enum DEVICE_INFO
|
||||||
{
|
{
|
||||||
WAVEFRONT_SIZE, //in AMD speak
|
WAVEFRONT_SIZE, //in AMD speak
|
||||||
WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak
|
WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak
|
||||||
|
IS_CPU_DEVICE //check if the device is CPU
|
||||||
};
|
};
|
||||||
//info should have been pre-allocated
|
//info should have been pre-allocated
|
||||||
void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
|
void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
|
||||||
|
@ -91,9 +91,6 @@ namespace cv
|
|||||||
extern const char *arithm_bitwise_xor_scalar_mask;
|
extern const char *arithm_bitwise_xor_scalar_mask;
|
||||||
extern const char *arithm_compare_eq;
|
extern const char *arithm_compare_eq;
|
||||||
extern const char *arithm_compare_ne;
|
extern const char *arithm_compare_ne;
|
||||||
extern const char *arithm_sub;
|
|
||||||
extern const char *arithm_sub_scalar;
|
|
||||||
extern const char *arithm_sub_scalar_mask;
|
|
||||||
extern const char *arithm_mul;
|
extern const char *arithm_mul;
|
||||||
extern const char *arithm_div;
|
extern const char *arithm_div;
|
||||||
extern const char *arithm_absdiff;
|
extern const char *arithm_absdiff;
|
||||||
@ -260,11 +257,11 @@ void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const ocl
|
|||||||
|
|
||||||
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
|
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
|
||||||
{
|
{
|
||||||
arithmetic_run(src1, src2, dst, "arithm_sub", &arithm_sub);
|
arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add);
|
||||||
}
|
}
|
||||||
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||||
{
|
{
|
||||||
arithmetic_run(src1, src2, dst, mask, "arithm_sub_with_mask", &arithm_sub);
|
arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add);
|
||||||
}
|
}
|
||||||
typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
|
typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
|
||||||
const char **kernelString, void *scalar);
|
const char **kernelString, void *scalar);
|
||||||
@ -451,14 +448,16 @@ void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const ocl
|
|||||||
|
|
||||||
void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
||||||
{
|
{
|
||||||
String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub";
|
String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
|
||||||
const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar;
|
const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
|
||||||
|
|
||||||
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1);
|
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1);
|
||||||
}
|
}
|
||||||
void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
|
void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
|
||||||
{
|
{
|
||||||
String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub";
|
String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
|
||||||
const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar;
|
const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
|
||||||
|
|
||||||
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
|
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
|
||||||
}
|
}
|
||||||
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
|
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
|
||||||
|
@ -394,6 +394,15 @@ namespace cv
|
|||||||
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case IS_CPU_DEVICE:
|
||||||
|
{
|
||||||
|
cl_device_type devicetype;
|
||||||
|
openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
|
||||||
|
CL_DEVICE_TYPE, sizeof(cl_device_type),
|
||||||
|
&devicetype, NULL));
|
||||||
|
*(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU);
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
CV_Error(-1, "Invalid device info type");
|
CV_Error(-1, "Invalid device info type");
|
||||||
break;
|
break;
|
||||||
|
@ -393,7 +393,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
|
|||||||
if( rtype < 0 )
|
if( rtype < 0 )
|
||||||
rtype = type();
|
rtype = type();
|
||||||
else
|
else
|
||||||
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
|
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels());
|
||||||
|
|
||||||
//int scn = channels();
|
//int scn = channels();
|
||||||
int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
|
int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
|
||||||
|
@ -52,6 +52,11 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ARITHM_ADD
|
||||||
|
#define ARITHM_OP(A,B) ((A)+(B))
|
||||||
|
#elif defined ARITHM_SUB
|
||||||
|
#define ARITHM_OP(A,B) ((A)-(B))
|
||||||
|
#endif
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
/////////////////////////////////////////////ADD////////////////////////////////////////////////////
|
/////////////////////////////////////////////ADD////////////////////////////////////////////////////
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -95,7 +100,7 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
|
|||||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||||
}
|
}
|
||||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||||
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
|
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
||||||
|
|
||||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||||
@ -134,7 +139,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
|
|||||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||||
|
|
||||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||||
int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data);
|
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
|
||||||
ushort4 tmp_data = convert_ushort4_sat(tmp);
|
ushort4 tmp_data = convert_ushort4_sat(tmp);
|
||||||
|
|
||||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||||
@ -172,7 +177,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
|
|||||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||||
|
|
||||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||||
int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data);
|
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
|
||||||
short4 tmp_data = convert_short4_sat(tmp);
|
short4 tmp_data = convert_short4_sat(tmp);
|
||||||
|
|
||||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||||
@ -200,7 +205,7 @@ __kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
|
|||||||
|
|
||||||
int data1 = *((__global int *)((__global char *)src1 + src1_index));
|
int data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||||
int data2 = *((__global int *)((__global char *)src2 + src2_index));
|
int data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||||
long tmp = (long)(data1) + (long)(data2);
|
long tmp = ARITHM_OP((long)(data1), (long)(data2));
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
|
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
|
||||||
}
|
}
|
||||||
@ -221,7 +226,7 @@ __kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offse
|
|||||||
|
|
||||||
float data1 = *((__global float *)((__global char *)src1 + src1_index));
|
float data1 = *((__global float *)((__global char *)src1 + src1_index));
|
||||||
float data2 = *((__global float *)((__global char *)src2 + src2_index));
|
float data2 = *((__global float *)((__global char *)src2 + src2_index));
|
||||||
float tmp = data1 + data2;
|
float tmp = ARITHM_OP(data1, data2);
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index)) = tmp;
|
*((__global float *)((__global char *)dst + dst_index)) = tmp;
|
||||||
}
|
}
|
||||||
@ -245,7 +250,7 @@ __kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offs
|
|||||||
double data1 = *((__global double *)((__global char *)src1 + src1_index));
|
double data1 = *((__global double *)((__global char *)src1 + src1_index));
|
||||||
double data2 = *((__global double *)((__global char *)src2 + src2_index));
|
double data2 = *((__global double *)((__global char *)src2 + src2_index));
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index)) = data1 + data2;
|
*((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -302,7 +307,7 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
|
|||||||
}
|
}
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||||
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
|
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||||
@ -344,7 +349,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
|
|||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||||
|
|
||||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||||
int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data);
|
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
|
||||||
ushort2 tmp_data = convert_ushort2_sat(tmp);
|
ushort2 tmp_data = convert_ushort2_sat(tmp);
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
||||||
@ -384,7 +389,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
|
|||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||||
|
|
||||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||||
int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data);
|
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
|
||||||
short2 tmp_data = convert_short2_sat(tmp);
|
short2 tmp_data = convert_short2_sat(tmp);
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
||||||
@ -416,7 +421,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
|
|||||||
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
|
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||||
int dst_data = *((__global int *)((__global char *)dst + dst_index));
|
int dst_data = *((__global int *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
int data = convert_int_sat((long)src_data1 + (long)src_data2);
|
int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -446,7 +451,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
|
|||||||
float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
|
float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
|
||||||
float dst_data = *((__global float *)((__global char *)dst + dst_index));
|
float dst_data = *((__global float *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
float data = src_data1 + src_data2;
|
float data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index)) = data;
|
*((__global float *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -477,7 +482,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
|
|||||||
double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
|
double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
|
||||||
double dst_data = *((__global double *)((__global char *)dst + dst_index));
|
double dst_data = *((__global double *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
double data = src_data1 + src_data2;
|
double data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index)) = data;
|
*((__global double *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -516,7 +521,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
|
|||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||||
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
|
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
||||||
|
|
||||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
||||||
@ -548,7 +553,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,
|
|||||||
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
|
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
|
||||||
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
|
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
|
||||||
ushort2 data = convert_ushort2_sat(tmp);
|
ushort2 data = convert_ushort2_sat(tmp);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
@ -578,7 +583,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i
|
|||||||
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
|
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
|
||||||
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
|
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
|
||||||
short2 data = convert_short2_sat(tmp);
|
short2 data = convert_short2_sat(tmp);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
@ -608,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
|
|||||||
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
|
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
|
||||||
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
|
int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -637,7 +642,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
|
|||||||
float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
|
float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
|
||||||
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
|
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
float2 data = src_data1 + src_data2;
|
float2 data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global float2 *)((__global char *)dst + dst_index)) = data;
|
*((__global float2 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -668,329 +673,14 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
|
|||||||
double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
|
double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
|
||||||
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
|
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
double2 data = src_data1 + src_data2;
|
double2 data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global double2 *)((__global char *)dst + dst_index)) = data;
|
*((__global double2 *)((__global char *)dst + dst_index)) = data;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 2;
|
|
||||||
|
|
||||||
#ifdef dst_align
|
|
||||||
#undef dst_align
|
|
||||||
#endif
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
|
||||||
|
|
||||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
|
||||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
|
||||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
|
||||||
|
|
||||||
uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
|
|
||||||
uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
|
|
||||||
uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
|
|
||||||
|
|
||||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
|
||||||
|
|
||||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
|
||||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
|
||||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
|
||||||
|
|
||||||
uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0));
|
|
||||||
uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1));
|
|
||||||
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));
|
|
||||||
|
|
||||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
|
||||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_0.w : data_0.w;
|
|
||||||
|
|
||||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_1.xy : data_1.xy;
|
|
||||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.zw : data_1.zw;
|
|
||||||
|
|
||||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.x : data_2.x;
|
|
||||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
|
||||||
? tmp_data_2.yzw : data_2.yzw;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#ifdef dst_align
|
|
||||||
#undef dst_align
|
|
||||||
#endif
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
|
||||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
|
||||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
|
|
||||||
ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
|
|
||||||
ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
|
|
||||||
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0));
|
|
||||||
ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
|
|
||||||
ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
|
|
||||||
|
|
||||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#ifdef dst_align
|
|
||||||
#undef dst_align
|
|
||||||
#endif
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
|
||||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
|
||||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
|
|
||||||
short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
|
|
||||||
short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
|
|
||||||
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0));
|
|
||||||
short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
|
|
||||||
short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
|
|
||||||
|
|
||||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
|
||||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
|
||||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
|
|
||||||
int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
|
|
||||||
int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
|
||||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
|
||||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0);
|
|
||||||
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
|
|
||||||
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
|
|
||||||
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
|
|
||||||
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
|
|
||||||
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
|
|
||||||
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
|
|
||||||
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
|
|
||||||
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
float tmp_data_0 = src1_data_0 + src2_data_0;
|
|
||||||
float tmp_data_1 = src1_data_1 + src2_data_1;
|
|
||||||
float tmp_data_2 = src1_data_2 + src2_data_2;
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *src2, int src2_step, int src2_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
int rows, int cols, int dst_step1)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
|
||||||
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
|
||||||
|
|
||||||
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
|
|
||||||
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
|
|
||||||
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
|
|
||||||
|
|
||||||
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
|
|
||||||
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
|
|
||||||
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
|
|
||||||
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
|
|
||||||
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
|
|
||||||
|
|
||||||
double tmp_data_0 = src1_data_0 + src2_data_0;
|
|
||||||
double tmp_data_1 = src1_data_1 + src2_data_1;
|
|
||||||
double tmp_data_2 = src1_data_2 + src2_data_2;
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||||
__global uchar *src2, int src2_step, int src2_offset,
|
__global uchar *src2, int src2_step, int src2_offset,
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
__global uchar *mask, int mask_step, int mask_offset,
|
||||||
@ -1014,7 +704,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
|
|||||||
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
|
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
|
||||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||||
|
|
||||||
uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2));
|
uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||||
@ -1043,7 +733,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
|
|||||||
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
|
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
|
||||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
|
ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -1072,7 +762,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
|
|||||||
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
|
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
|
||||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
|
short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -1101,7 +791,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
|
|||||||
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
|
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
|
||||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2));
|
int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -1130,7 +820,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
|
|||||||
float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
|
float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
|
||||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
float4 data = src_data1 + src_data2;
|
float4 data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global float4 *)((__global char *)dst + dst_index)) = data;
|
*((__global float4 *)((__global char *)dst + dst_index)) = data;
|
||||||
@ -1161,7 +851,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
|
|||||||
double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
|
double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
|
||||||
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
||||||
|
|
||||||
double4 data = src_data1 + src_data2;
|
double4 data = ARITHM_OP(src_data1, src_data2);
|
||||||
data = mask_data ? data : dst_data;
|
data = mask_data ? data : dst_data;
|
||||||
|
|
||||||
*((__global double4 *)((__global char *)dst + dst_index)) = data;
|
*((__global double4 *)((__global char *)dst + dst_index)) = data;
|
||||||
|
@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
|
|||||||
if (x < thread_cols && y < rows)
|
if (x < thread_cols && y < rows)
|
||||||
{
|
{
|
||||||
int src_index_0 = mad24(y, src_step, (x) + src_offset);
|
int src_index_0 = mad24(y, src_step, (x) + src_offset);
|
||||||
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
|
|
||||||
|
|
||||||
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
|
|
||||||
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
|
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
|
||||||
|
|
||||||
uchar data0 = *(src + src_index_0);
|
uchar data0 = *(src + src_index_0);
|
||||||
uchar data1 = *(src + src_index_1);
|
|
||||||
|
|
||||||
*(dst + dst_index_0) = data1;
|
|
||||||
*(dst + dst_index_1) = data0;
|
*(dst + dst_index_1) = data0;
|
||||||
|
|
||||||
|
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
|
||||||
|
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
|
||||||
|
uchar data1 = *(src + src_index_1);
|
||||||
|
*(dst + dst_index_0) = data1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,
|
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,806 +0,0 @@
|
|||||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
//
|
|
||||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
|
||||||
//
|
|
||||||
// By downloading, copying, installing or using the software you agree to this license.
|
|
||||||
// If you do not agree to this license, do not download, install,
|
|
||||||
// copy or use the software.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// License Agreement
|
|
||||||
// For Open Source Computer Vision Library
|
|
||||||
//
|
|
||||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
|
||||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
|
||||||
// Third party copyrights are property of their respective owners.
|
|
||||||
//
|
|
||||||
// @Authors
|
|
||||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
|
||||||
//
|
|
||||||
// Redistribution and use in source and binary forms, with or without modification,
|
|
||||||
// are permitted provided that the following conditions are met:
|
|
||||||
//
|
|
||||||
// * Redistribution's of source code must retain the above copyright notice,
|
|
||||||
// this list of conditions and the following disclaimer.
|
|
||||||
//
|
|
||||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
|
||||||
// this list of conditions and the following disclaimer in the documentation
|
|
||||||
// and/or other oclMaterials provided with the distribution.
|
|
||||||
//
|
|
||||||
// * The name of the copyright holders may not be used to endorse or promote products
|
|
||||||
// derived from this software without specific prior written permission.
|
|
||||||
//
|
|
||||||
// This software is provided by the copyright holders and contributors as is and
|
|
||||||
// any express or implied warranties, including, but not limited to, the implied
|
|
||||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
|
||||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
|
||||||
// indirect, incidental, special, exemplary, or consequential damages
|
|
||||||
// (including, but not limited to, procurement of substitute goods or services;
|
|
||||||
// loss of use, data, or profits; or business interruption) however caused
|
|
||||||
// and on any theory of liability, whether in contract, strict liability,
|
|
||||||
// or tort (including negligence or otherwise) arising in any way out of
|
|
||||||
// the use of this software, even if advised of the possibility of such damage.
|
|
||||||
//
|
|
||||||
//M*/
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
|
||||||
#endif
|
|
||||||
/**************************************sub with scalar without mask**************************************/
|
|
||||||
__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 2;
|
|
||||||
|
|
||||||
#define dst_align (dst_offset & 3)
|
|
||||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
|
||||||
|
|
||||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
|
||||||
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
|
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
|
||||||
int4 tmp = convert_int4_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
|
||||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
|
||||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
|
||||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src2_data = (int2)(src2.x, src2.x);
|
|
||||||
|
|
||||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
|
||||||
int2 tmp = convert_int2_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort2 tmp_data = convert_ushort2_sat(tmp);
|
|
||||||
|
|
||||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
|
||||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src2_data = (int2)(src2.x, src2.x);
|
|
||||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short2 tmp_data = convert_short2_sat(tmp);
|
|
||||||
|
|
||||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
|
||||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
|
|
||||||
int src_data2 = src2.x;
|
|
||||||
|
|
||||||
long tmp = (long)src_data1 - (long)src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
int data = convert_int_sat(tmp);
|
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
|
|
||||||
float src_data2 = src2.x;
|
|
||||||
|
|
||||||
float tmp = src_data1 - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index)) = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
|
|
||||||
double src2_data = src2.x;
|
|
||||||
|
|
||||||
double data = src_data1 - src2_data;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
|
||||||
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
|
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
|
||||||
int4 tmp = convert_int4_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
|
||||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort2 data = convert_ushort2_sat(tmp);
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short2 data = convert_short2_sat(tmp);
|
|
||||||
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
int2 data = convert_int2_sat(tmp);
|
|
||||||
|
|
||||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
|
|
||||||
float2 src_data2 = (float2)(src2.x, src2.y);
|
|
||||||
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
float2 tmp = src_data1 - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
|
|
||||||
*((__global float2 *)((__global char *)dst + dst_index)) = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
|
|
||||||
double2 src_data2 = (double2)(src2.x, src2.y);
|
|
||||||
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
double2 data = src_data1 - src_data2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
|
|
||||||
*((__global double2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 2;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
|
||||||
|
|
||||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
|
||||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
|
||||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
|
||||||
|
|
||||||
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
|
|
||||||
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
|
|
||||||
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
|
|
||||||
|
|
||||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
|
||||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
|
||||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
|
||||||
|
|
||||||
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
|
|
||||||
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
|
|
||||||
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
|
|
||||||
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
|
|
||||||
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
|
||||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_0.w : data_0.w;
|
|
||||||
|
|
||||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_1.xy : data_1.xy;
|
|
||||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.zw : data_1.zw;
|
|
||||||
|
|
||||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.x : data_2.x;
|
|
||||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
|
||||||
? tmp_data_2.yzw : data_2.yzw;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
|
||||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
|
||||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int2 src2_data_0 = (int2)(src2.x, src2.y);
|
|
||||||
int2 src2_data_1 = (int2)(src2.z, src2.x);
|
|
||||||
int2 src2_data_2 = (int2)(src2.y, src2.z);
|
|
||||||
|
|
||||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
|
|
||||||
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
|
|
||||||
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
|
|
||||||
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
|
|
||||||
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
|
||||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
|
||||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int2 src2_data_0 = (int2)(src2.x, src2.y);
|
|
||||||
int2 src2_data_1 = (int2)(src2.z, src2.x);
|
|
||||||
int2 src2_data_2 = (int2)(src2.y, src2.z);
|
|
||||||
|
|
||||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
|
|
||||||
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
|
|
||||||
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
short2 tmp_data_0 = convert_short2_sat(tmp_0);
|
|
||||||
short2 tmp_data_1 = convert_short2_sat(tmp_1);
|
|
||||||
short2 tmp_data_2 = convert_short2_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
|
||||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
|
||||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int src2_data_0 = src2.x;
|
|
||||||
int src2_data_1 = src2.y;
|
|
||||||
int src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
|
||||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
|
||||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
|
|
||||||
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
|
|
||||||
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
int tmp_data_0 = convert_int_sat(tmp_0);
|
|
||||||
int tmp_data_1 = convert_int_sat(tmp_1);
|
|
||||||
int tmp_data_2 = convert_int_sat(tmp_2);
|
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
|
|
||||||
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
|
|
||||||
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
float src2_data_0 = src2.x;
|
|
||||||
float src2_data_1 = src2.y;
|
|
||||||
float src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
|
|
||||||
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
|
|
||||||
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
float tmp_0 = src1_data_0 - src2_data_0;
|
|
||||||
float tmp_1 = src1_data_1 - src2_data_1;
|
|
||||||
float tmp_2 = src1_data_2 - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_0;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_1;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
|
||||||
|
|
||||||
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
|
|
||||||
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
|
|
||||||
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
|
|
||||||
|
|
||||||
double src2_data_0 = src2.x;
|
|
||||||
double src2_data_1 = src2.y;
|
|
||||||
double src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
|
|
||||||
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
|
|
||||||
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
|
|
||||||
|
|
||||||
double tmp_data_0 = src1_data_0 - src2_data_0;
|
|
||||||
double tmp_data_1 = src1_data_1 - src2_data_1;
|
|
||||||
double tmp_data_2 = src1_data_2 - src2_data_2;
|
|
||||||
|
|
||||||
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
|
|
||||||
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
|
|
||||||
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
|
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort4 data = convert_ushort4_sat(tmp);
|
|
||||||
|
|
||||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short4 data = convert_short4_sat(tmp);
|
|
||||||
|
|
||||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
|
|
||||||
|
|
||||||
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
int4 data = convert_int4_sat(tmp);
|
|
||||||
|
|
||||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
|
|
||||||
|
|
||||||
float4 tmp = src_data1 - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
|
|
||||||
*((__global float4 *)((__global char *)dst + dst_index)) = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
|
|
||||||
|
|
||||||
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
|
|
||||||
|
|
||||||
double4 data = src_data1 - src2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
|
|
||||||
*((__global double4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,941 +0,0 @@
|
|||||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
//
|
|
||||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
|
||||||
//
|
|
||||||
// By downloading, copying, installing or using the software you agree to this license.
|
|
||||||
// If you do not agree to this license, do not download, install,
|
|
||||||
// copy or use the software.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// License Agreement
|
|
||||||
// For Open Source Computer Vision Library
|
|
||||||
//
|
|
||||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
|
||||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
|
||||||
// Third party copyrights are property of their respective owners.
|
|
||||||
//
|
|
||||||
// @Authors
|
|
||||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
|
||||||
//
|
|
||||||
// Redistribution and use in source and binary forms, with or without modification,
|
|
||||||
// are permitted provided that the following conditions are met:
|
|
||||||
//
|
|
||||||
// * Redistribution's of source code must retain the above copyright notice,
|
|
||||||
// this list of conditions and the following disclaimer.
|
|
||||||
//
|
|
||||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
|
||||||
// this list of conditions and the following disclaimer in the documentation
|
|
||||||
// and/or other GpuMaterials provided with the distribution.
|
|
||||||
//
|
|
||||||
// * The name of the copyright holders may not be used to endorse or promote products
|
|
||||||
// derived from this software without specific prior written permission.
|
|
||||||
//
|
|
||||||
// This software is provided by the copyright holders and contributors as is and
|
|
||||||
// any express or implied warranties, including, but not limited to, the implied
|
|
||||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
|
||||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
|
||||||
// indirect, incidental, special, exemplary, or consequential damages
|
|
||||||
// (including, but not limited to, procurement of substitute goods or services;
|
|
||||||
// loss of use, data, or profits; or business interruption) however caused
|
|
||||||
// and on any theory of liability, whether in contract, strict liability,
|
|
||||||
// or tort (including negligence or otherwise) arising in any way out of
|
|
||||||
// the use of this software, even if advised of the possibility of such damage.
|
|
||||||
//
|
|
||||||
//M*/
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**************************************sub with scalar with mask**************************************/
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 2;
|
|
||||||
|
|
||||||
#define dst_align (dst_offset & 3)
|
|
||||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
|
||||||
|
|
||||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
|
||||||
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
|
|
||||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
|
||||||
int4 tmp = convert_int4_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
|
||||||
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
|
||||||
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
|
||||||
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src2_data = (int2)(src2.x, src2.x);
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
|
||||||
int2 tmp = convert_int2_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort2 tmp_data = convert_ushort2_sat(tmp);
|
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
|
||||||
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src2_data = (int2)(src2.x, src2.x);
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
|
||||||
int2 tmp = convert_int2_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short2 tmp_data = convert_short2_sat(tmp);
|
|
||||||
|
|
||||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
|
||||||
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
|
|
||||||
int src_data2 = src2.x;
|
|
||||||
int dst_data = *((__global int *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
long tmp = (long)src_data1 - (long)src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : - tmp;
|
|
||||||
int data = convert_int_sat(tmp);
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
|
|
||||||
float src_data2 = src2.x;
|
|
||||||
float dst_data = *((__global float *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
float data = src_data1 - src_data2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
|
|
||||||
double src_data2 = src2.x;
|
|
||||||
double dst_data = *((__global double *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
double data = src_data1 - src_data2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align ((dst_offset >> 1) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
|
||||||
|
|
||||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
|
||||||
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
|
||||||
int4 tmp = convert_int4_sat(src1_data) - src2_data;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
|
||||||
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort2 data = convert_ushort2_sat(tmp);
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int2 tmp = convert_int2_sat(src_data1) - src_data2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short2 data = convert_short2_sat(tmp);
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
|
|
||||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
|
||||||
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
int2 data = convert_int2_sat(tmp);
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
|
|
||||||
float2 src_data2 = (float2)(src2.x, src2.y);
|
|
||||||
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
float2 data = src_data1 - src_data2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global float2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
|
|
||||||
double2 src_data2 = (double2)(src2.x, src2.y);
|
|
||||||
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
double2 data = src_data1 - src_data2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global double2 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 2;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
|
||||||
|
|
||||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
|
||||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
|
||||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
|
||||||
|
|
||||||
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
|
|
||||||
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
|
|
||||||
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
|
|
||||||
|
|
||||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
|
||||||
|
|
||||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
|
||||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
|
||||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
|
||||||
|
|
||||||
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
|
|
||||||
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
|
|
||||||
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
|
|
||||||
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
|
|
||||||
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
|
||||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_0.w : data_0.w;
|
|
||||||
|
|
||||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
|
||||||
? tmp_data_1.xy : data_1.xy;
|
|
||||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.zw : data_1.zw;
|
|
||||||
|
|
||||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.x : data_2.x;
|
|
||||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
|
||||||
? tmp_data_2.yzw : data_2.yzw;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
|
||||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
|
||||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
|
||||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int2 src2_data_0 = (int2)(src2.x, src2.y);
|
|
||||||
int2 src2_data_1 = (int2)(src2.z, src2.x);
|
|
||||||
int2 src2_data_2 = (int2)(src2.y, src2.z);
|
|
||||||
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
|
|
||||||
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
|
|
||||||
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
|
|
||||||
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
|
|
||||||
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
x = x << 1;
|
|
||||||
|
|
||||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
|
||||||
|
|
||||||
int dst_start = mad24(y, dst_step, dst_offset);
|
|
||||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
|
||||||
|
|
||||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
|
||||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
|
||||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int2 src2_data_0 = (int2)(src2.x, src2.y);
|
|
||||||
int2 src2_data_1 = (int2)(src2.z, src2.x);
|
|
||||||
int2 src2_data_2 = (int2)(src2.y, src2.z);
|
|
||||||
|
|
||||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
|
||||||
|
|
||||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
|
||||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
|
||||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
|
|
||||||
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
|
|
||||||
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
short2 tmp_data_0 = convert_short2_sat(tmp_0);
|
|
||||||
short2 tmp_data_1 = convert_short2_sat(tmp_1);
|
|
||||||
short2 tmp_data_2 = convert_short2_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
|
||||||
|
|
||||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
|
||||||
? tmp_data_1.x : data_1.x;
|
|
||||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_1.y : data_1.y;
|
|
||||||
|
|
||||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
|
||||||
? tmp_data_2.xy : data_2.xy;
|
|
||||||
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
|
||||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
|
||||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
int src2_data_0 = src2.x;
|
|
||||||
int src2_data_1 = src2.y;
|
|
||||||
int src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
|
||||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
|
||||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
|
|
||||||
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
|
|
||||||
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
|
|
||||||
|
|
||||||
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
|
|
||||||
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
|
|
||||||
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
|
|
||||||
|
|
||||||
int tmp_data_0 = convert_int_sat(tmp_0);
|
|
||||||
int tmp_data_1 = convert_int_sat(tmp_1);
|
|
||||||
int tmp_data_2 = convert_int_sat(tmp_2);
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
|
||||||
|
|
||||||
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
|
|
||||||
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
|
|
||||||
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
|
|
||||||
|
|
||||||
float src2_data_0 = src2.x;
|
|
||||||
float src2_data_1 = src2.y;
|
|
||||||
float src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
|
|
||||||
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
|
|
||||||
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
|
|
||||||
|
|
||||||
float tmp_data_0 = src1_data_0 - src2_data_0;
|
|
||||||
float tmp_data_1 = src1_data_1 - src2_data_1;
|
|
||||||
float tmp_data_2 = src1_data_2 - src2_data_2;
|
|
||||||
|
|
||||||
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
|
|
||||||
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
|
|
||||||
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
|
||||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
|
||||||
|
|
||||||
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
|
|
||||||
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
|
|
||||||
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
|
|
||||||
|
|
||||||
double src2_data_0 = src2.x;
|
|
||||||
double src2_data_1 = src2.y;
|
|
||||||
double src2_data_2 = src2.z;
|
|
||||||
|
|
||||||
uchar mask_data = * (mask + mask_index);
|
|
||||||
|
|
||||||
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
|
|
||||||
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
|
|
||||||
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
|
|
||||||
|
|
||||||
double tmp_data_0 = src1_data_0 - src2_data_0;
|
|
||||||
double tmp_data_1 = src1_data_1 - src2_data_1;
|
|
||||||
double tmp_data_2 = src1_data_2 - src2_data_2;
|
|
||||||
|
|
||||||
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
|
|
||||||
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
|
|
||||||
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
|
|
||||||
|
|
||||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
|
||||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
|
||||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
|
||||||
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
|
||||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
|
||||||
__global uchar *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
|
|
||||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
uchar4 data = convert_uchar4_sat(tmp);
|
|
||||||
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
|
||||||
__global ushort *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
|
|
||||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
ushort4 data = convert_ushort4_sat(tmp);
|
|
||||||
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
|
||||||
__global short *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
|
|
||||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
int4 tmp = convert_int4_sat(src_data1) - src2;
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
short4 data = convert_short4_sat(tmp);
|
|
||||||
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
|
||||||
__global int *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
|
|
||||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
|
|
||||||
tmp = isMatSubScalar ? tmp : -tmp;
|
|
||||||
int4 data = convert_int4_sat(tmp);
|
|
||||||
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
|
|
||||||
__global float *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
|
|
||||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
float4 data = src_data1 - src2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global float4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
|
||||||
__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
|
|
||||||
__global double *dst, int dst_step, int dst_offset,
|
|
||||||
__global uchar *mask, int mask_step, int mask_offset,
|
|
||||||
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
|
|
||||||
{
|
|
||||||
|
|
||||||
int x = get_global_id(0);
|
|
||||||
int y = get_global_id(1);
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
|
||||||
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
|
|
||||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
|
||||||
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
|
|
||||||
|
|
||||||
uchar mask_data = *(mask + mask_index);
|
|
||||||
|
|
||||||
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
|
|
||||||
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
|
||||||
|
|
||||||
double4 data = src_data1 - src2;
|
|
||||||
data = isMatSubScalar ? data : -data;
|
|
||||||
data = mask_data ? data : dst_data;
|
|
||||||
|
|
||||||
*((__global double4 *)((__global char *)dst + dst_index)) = data;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,5 +1,58 @@
|
|||||||
|
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||||
|
//
|
||||||
|
// By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
// If you do not agree to this license, do not download, install,
|
||||||
|
// copy or use the software.
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// License Agreement
|
||||||
|
// For Open Source Computer Vision Library
|
||||||
|
//
|
||||||
|
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||||
|
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
//
|
||||||
|
// @Authors
|
||||||
|
// Nathan, liujun@multicorewareinc.com
|
||||||
|
// Peng Xiao, pengxiao@outlook.com
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
// are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistribution's of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other oclMaterials provided with the distribution.
|
||||||
|
//
|
||||||
|
// * The name of the copyright holders may not be used to endorse or promote products
|
||||||
|
// derived from this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// This software is provided by the copyright holders and contributors "as is" and
|
||||||
|
// any express or implied warranties, including, but not limited to, the implied
|
||||||
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||||
|
// indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
// (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
// loss of use, data, or profits; or business interruption) however caused
|
||||||
|
// and on any theory of liability, whether in contract, strict liability,
|
||||||
|
// or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
// the use of this software, even if advised of the possibility of such damage.
|
||||||
|
//
|
||||||
|
//M*/
|
||||||
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||||
#define MAX_FLOAT 1e7f
|
#define MAX_FLOAT 3.40282e+038f
|
||||||
|
|
||||||
|
#ifndef BLOCK_SIZE
|
||||||
|
#define BLOCK_SIZE 16
|
||||||
|
#endif
|
||||||
|
#ifndef MAX_DESC_LEN
|
||||||
|
#define MAX_DESC_LEN 64
|
||||||
|
#endif
|
||||||
|
|
||||||
int bit1Count(float x)
|
int bit1Count(float x)
|
||||||
{
|
{
|
||||||
@ -13,83 +66,52 @@ int bit1Count(float x)
|
|||||||
return (float)c;
|
return (float)c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef DIST_TYPE
|
||||||
|
#define DIST_TYPE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (DIST_TYPE == 0)
|
||||||
|
#define DIST(x, y) fabs((x) - (y))
|
||||||
|
#elif (DIST_TYPE == 1)
|
||||||
|
#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
|
||||||
|
#elif (DIST_TYPE == 2)
|
||||||
|
#define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
float reduce_block(__local float *s_query,
|
float reduce_block(__local float *s_query,
|
||||||
__local float *s_train,
|
__local float *s_train,
|
||||||
int block_size,
|
|
||||||
int lidx,
|
int lidx,
|
||||||
int lidy,
|
int lidy
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
|
||||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
switch(distType)
|
#pragma unroll
|
||||||
|
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||||
{
|
{
|
||||||
case 0:
|
result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 1:
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
|
||||||
result += qr * qr;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
float reduce_multi_block(__local float *s_query,
|
float reduce_multi_block(__local float *s_query,
|
||||||
__local float *s_train,
|
__local float *s_train,
|
||||||
int max_desc_len,
|
|
||||||
int block_size,
|
|
||||||
int block_index,
|
int block_index,
|
||||||
int lidx,
|
int lidx,
|
||||||
int lidy,
|
int lidy
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
|
||||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
switch(distType)
|
#pragma unroll
|
||||||
|
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||||
{
|
{
|
||||||
case 0:
|
result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 1:
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx];
|
|
||||||
result += qr * qr;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
|
||||||
{
|
|
||||||
//result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
|
||||||
result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
|
||||||
local size: dim0 is block_size, dim1 is block_size.
|
local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
|
||||||
*/
|
*/
|
||||||
__kernel void BruteForceMatch_UnrollMatch_D5(
|
__kernel void BruteForceMatch_UnrollMatch_D5(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
@ -98,29 +120,28 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
|||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int max_desc_len,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
|
|
||||||
__local float *s_query = sharebuffer;
|
__local float *s_query = sharebuffer;
|
||||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
__local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||||
|
|
||||||
int queryIdx = groupidx * block_size + lidy;
|
int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||||
// load the query into local memory.
|
// load the query into local memory.
|
||||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
#pragma unroll
|
||||||
|
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
|
||||||
{
|
{
|
||||||
int loadx = lidx + i * block_size;
|
int loadx = lidx + i * BLOCK_SIZE;
|
||||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
float myBestDistance = MAX_FLOAT;
|
float myBestDistance = MAX_FLOAT;
|
||||||
@ -128,24 +149,25 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
|||||||
|
|
||||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||||
volatile int imgIdx = 0;
|
volatile int imgIdx = 0;
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
|
||||||
{
|
{
|
||||||
float result = 0;
|
float result = 0;
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
#pragma unroll
|
||||||
|
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
|
|
||||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
|
result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
int trainIdx = t * block_size + lidx;
|
int trainIdx = t * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||||
{
|
{
|
||||||
@ -157,18 +179,19 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
|||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
__local float *s_distance = (__local float*)(sharebuffer);
|
__local float *s_distance = (__local float*)(sharebuffer);
|
||||||
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
__local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||||
|
|
||||||
//find BestMatch
|
//find BestMatch
|
||||||
s_distance += lidy * block_size;
|
s_distance += lidy * BLOCK_SIZE;
|
||||||
s_trainIdx += lidy * block_size;
|
s_trainIdx += lidy * BLOCK_SIZE;
|
||||||
s_distance[lidx] = myBestDistance;
|
s_distance[lidx] = myBestDistance;
|
||||||
s_trainIdx[lidx] = myBestTrainIdx;
|
s_trainIdx[lidx] = myBestTrainIdx;
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
//reduce -- now all reduce implement in each threads.
|
//reduce -- now all reduce implement in each threads.
|
||||||
for (int k = 0 ; k < block_size; k++)
|
#pragma unroll
|
||||||
|
for (int k = 0 ; k < BLOCK_SIZE; k++)
|
||||||
{
|
{
|
||||||
if (myBestDistance > s_distance[k])
|
if (myBestDistance > s_distance[k])
|
||||||
{
|
{
|
||||||
@ -191,53 +214,51 @@ __kernel void BruteForceMatch_Match_D5(
|
|||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
|
|
||||||
const int queryIdx = groupidx * block_size + lidy;
|
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||||
|
|
||||||
float myBestDistance = MAX_FLOAT;
|
float myBestDistance = MAX_FLOAT;
|
||||||
int myBestTrainIdx = -1;
|
int myBestTrainIdx = -1;
|
||||||
|
|
||||||
__local float *s_query = sharebuffer;
|
__local float *s_query = sharebuffer;
|
||||||
__local float *s_train = sharebuffer + block_size * block_size;
|
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||||
|
|
||||||
// loop
|
// loop
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||||
{
|
{
|
||||||
//Dist dist;
|
//Dist dist;
|
||||||
float result = 0;
|
float result = 0;
|
||||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
//load query and train into local memory
|
//load query and train into local memory
|
||||||
s_query[lidy * block_size + lidx] = 0;
|
s_query[lidy * BLOCK_SIZE + lidx] = 0;
|
||||||
s_train[lidx * block_size + lidy] = 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = 0;
|
||||||
|
|
||||||
if (loadx < query_cols)
|
if (loadx < query_cols)
|
||||||
{
|
{
|
||||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int trainIdx = t * block_size + lidx;
|
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||||
{
|
{
|
||||||
@ -250,18 +271,18 @@ __kernel void BruteForceMatch_Match_D5(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
__local float *s_distance = (__local float *)sharebuffer;
|
__local float *s_distance = (__local float *)sharebuffer;
|
||||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||||
|
|
||||||
//findBestMatch
|
//findBestMatch
|
||||||
s_distance += lidy * block_size;
|
s_distance += lidy * BLOCK_SIZE;
|
||||||
s_trainIdx += lidy * block_size;
|
s_trainIdx += lidy * BLOCK_SIZE;
|
||||||
s_distance[lidx] = myBestDistance;
|
s_distance[lidx] = myBestDistance;
|
||||||
s_trainIdx[lidx] = myBestTrainIdx;
|
s_trainIdx[lidx] = myBestTrainIdx;
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
//reduce -- now all reduce implement in each threads.
|
//reduce -- now all reduce implement in each threads.
|
||||||
for (int k = 0 ; k < block_size; k++)
|
for (int k = 0 ; k < BLOCK_SIZE; k++)
|
||||||
{
|
{
|
||||||
if (myBestDistance > s_distance[k])
|
if (myBestDistance > s_distance[k])
|
||||||
{
|
{
|
||||||
@ -287,16 +308,13 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
|||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__global int *nMatches,
|
__global int *nMatches,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int max_desc_len,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int bestTrainIdx_cols,
|
int bestTrainIdx_cols,
|
||||||
int step,
|
int step,
|
||||||
int ostep,
|
int ostep
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
@ -304,25 +322,25 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
|||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
const int groupidy = get_group_id(1);
|
const int groupidy = get_group_id(1);
|
||||||
|
|
||||||
const int queryIdx = groupidy * block_size + lidy;
|
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||||
const int trainIdx = groupidx * block_size + lidx;
|
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
__local float *s_query = sharebuffer;
|
__local float *s_query = sharebuffer;
|
||||||
__local float *s_train = sharebuffer + block_size * block_size;
|
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||||
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
|
|
||||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
|
|
||||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
@ -350,15 +368,13 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
|||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__global int *nMatches,
|
__global int *nMatches,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int bestTrainIdx_cols,
|
int bestTrainIdx_cols,
|
||||||
int step,
|
int step,
|
||||||
int ostep,
|
int ostep
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
@ -366,25 +382,25 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
|||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
const int groupidy = get_group_id(1);
|
const int groupidy = get_group_id(1);
|
||||||
|
|
||||||
const int queryIdx = groupidy * block_size + lidy;
|
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||||
const int trainIdx = groupidx * block_size + lidx;
|
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
__local float *s_query = sharebuffer;
|
__local float *s_query = sharebuffer;
|
||||||
__local float *s_train = sharebuffer + block_size * block_size;
|
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||||
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
|
|
||||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
|
|
||||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
@ -410,29 +426,26 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
|||||||
__global int2 *bestTrainIdx,
|
__global int2 *bestTrainIdx,
|
||||||
__global float2 *bestDistance,
|
__global float2 *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int max_desc_len,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
|
|
||||||
const int queryIdx = groupidx * block_size + lidy;
|
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||||
local float *s_query = sharebuffer;
|
local float *s_query = sharebuffer;
|
||||||
local float *s_train = sharebuffer + block_size * max_desc_len;
|
local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||||
|
|
||||||
// load the query into local memory.
|
// load the query into local memory.
|
||||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
|
||||||
{
|
{
|
||||||
int loadx = lidx + i * block_size;
|
int loadx = lidx + i * BLOCK_SIZE;
|
||||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
float myBestDistance1 = MAX_FLOAT;
|
float myBestDistance1 = MAX_FLOAT;
|
||||||
@ -442,25 +455,25 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
|||||||
|
|
||||||
//loopUnrolledCached
|
//loopUnrolledCached
|
||||||
volatile int imgIdx = 0;
|
volatile int imgIdx = 0;
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||||
{
|
{
|
||||||
float result = 0;
|
float result = 0;
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
const int loadX = lidx + i * block_size;
|
const int loadX = lidx + i * BLOCK_SIZE;
|
||||||
//load a block_size * block_size block into local train.
|
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||||
|
|
||||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
|
result += reduce_multi_block(s_query, s_train, i, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int trainIdx = t * block_size + lidx;
|
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||||
{
|
{
|
||||||
@ -482,11 +495,11 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
local float *s_distance = (local float *)sharebuffer;
|
local float *s_distance = (local float *)sharebuffer;
|
||||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
|
local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||||
|
|
||||||
// find BestMatch
|
// find BestMatch
|
||||||
s_distance += lidy * block_size;
|
s_distance += lidy * BLOCK_SIZE;
|
||||||
s_trainIdx += lidy * block_size;
|
s_trainIdx += lidy * BLOCK_SIZE;
|
||||||
|
|
||||||
s_distance[lidx] = myBestDistance1;
|
s_distance[lidx] = myBestDistance1;
|
||||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||||
@ -499,7 +512,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
|||||||
|
|
||||||
if (lidx == 0)
|
if (lidx == 0)
|
||||||
{
|
{
|
||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
if (val < bestDistance1)
|
if (val < bestDistance1)
|
||||||
@ -527,7 +540,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
|||||||
|
|
||||||
if (lidx == 0)
|
if (lidx == 0)
|
||||||
{
|
{
|
||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
|
|
||||||
@ -559,22 +572,20 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
|||||||
__global int2 *bestTrainIdx,
|
__global int2 *bestTrainIdx,
|
||||||
__global float2 *bestDistance,
|
__global float2 *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step
|
||||||
int distType
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
const int groupidx = get_group_id(0);
|
const int groupidx = get_group_id(0);
|
||||||
|
|
||||||
const int queryIdx = groupidx * block_size + lidy;
|
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||||
local float *s_query = sharebuffer;
|
local float *s_query = sharebuffer;
|
||||||
local float *s_train = sharebuffer + block_size * block_size;
|
local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||||
|
|
||||||
float myBestDistance1 = MAX_FLOAT;
|
float myBestDistance1 = MAX_FLOAT;
|
||||||
float myBestDistance2 = MAX_FLOAT;
|
float myBestDistance2 = MAX_FLOAT;
|
||||||
@ -582,30 +593,30 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
|||||||
int myBestTrainIdx2 = -1;
|
int myBestTrainIdx2 = -1;
|
||||||
|
|
||||||
//loop
|
//loop
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||||
{
|
{
|
||||||
float result = 0.0f;
|
float result = 0.0f;
|
||||||
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
|
for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * BLOCK_SIZE;
|
||||||
//load query and train into local memory
|
//load query and train into local memory
|
||||||
s_query[lidy * block_size + lidx] = 0;
|
s_query[lidy * BLOCK_SIZE + lidx] = 0;
|
||||||
s_train[lidx * block_size + lidy] = 0;
|
s_train[lidx * BLOCK_SIZE + lidy] = 0;
|
||||||
|
|
||||||
if (loadx < query_cols)
|
if (loadx < query_cols)
|
||||||
{
|
{
|
||||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int trainIdx = t * block_size + lidx;
|
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||||
|
|
||||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||||
{
|
{
|
||||||
@ -627,11 +638,11 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
__local float *s_distance = (__local float *)sharebuffer;
|
__local float *s_distance = (__local float *)sharebuffer;
|
||||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||||
|
|
||||||
//findBestMatch
|
//findBestMatch
|
||||||
s_distance += lidy * block_size;
|
s_distance += lidy * BLOCK_SIZE;
|
||||||
s_trainIdx += lidy * block_size;
|
s_trainIdx += lidy * BLOCK_SIZE;
|
||||||
|
|
||||||
s_distance[lidx] = myBestDistance1;
|
s_distance[lidx] = myBestDistance1;
|
||||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||||
@ -644,7 +655,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
|||||||
|
|
||||||
if (lidx == 0)
|
if (lidx == 0)
|
||||||
{
|
{
|
||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
if (val < bestDistance1)
|
if (val < bestDistance1)
|
||||||
@ -672,7 +683,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
|||||||
|
|
||||||
if (lidx == 0)
|
if (lidx == 0)
|
||||||
{
|
{
|
||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
|
|
||||||
@ -703,14 +714,11 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
|
|||||||
//__global float *mask,
|
//__global float *mask,
|
||||||
__global float *allDist,
|
__global float *allDist,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int max_desc_len,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step)
|
||||||
int distType)
|
|
||||||
{
|
{
|
||||||
/* Todo */
|
/* Todo */
|
||||||
}
|
}
|
||||||
@ -721,13 +729,11 @@ kernel void BruteForceMatch_calcDistance_D5(
|
|||||||
//__global float *mask,
|
//__global float *mask,
|
||||||
__global float *allDist,
|
__global float *allDist,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
|
||||||
int query_rows,
|
int query_rows,
|
||||||
int query_cols,
|
int query_cols,
|
||||||
int train_rows,
|
int train_rows,
|
||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step)
|
||||||
int distType)
|
|
||||||
{
|
{
|
||||||
/* Todo */
|
/* Todo */
|
||||||
}
|
}
|
||||||
@ -736,8 +742,7 @@ kernel void BruteForceMatch_findBestMatch_D5(
|
|||||||
__global float *allDist,
|
__global float *allDist,
|
||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
int k,
|
int k
|
||||||
int block_size
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
/* Todo */
|
/* Todo */
|
||||||
|
@ -339,8 +339,8 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
|||||||
//judge if read out of boundary
|
//judge if read out of boundary
|
||||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||||
{
|
{
|
||||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
|
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float)0,temp[i]);
|
||||||
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
|
temp[i]= ELEM(start_y,0,src_whole_rows,(float)0,temp[i]);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
int index[READ_TIMES_ROW];
|
int index[READ_TIMES_ROW];
|
||||||
@ -422,8 +422,8 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
|||||||
//judge if read out of boundary
|
//judge if read out of boundary
|
||||||
for(i = 0; i<READ_TIMES_ROW; i++)
|
for(i = 0; i<READ_TIMES_ROW; i++)
|
||||||
{
|
{
|
||||||
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
|
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float4)0,temp[i]);
|
||||||
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
|
temp[i]= ELEM(start_y,0,src_whole_rows,(float4)0,temp[i]);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
int index[READ_TIMES_ROW];
|
int index[READ_TIMES_ROW];
|
||||||
@ -465,4 +465,5 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
|||||||
start_addr = mad24(y,dst_step_in_pixel,x);
|
start_addr = mad24(y,dst_step_in_pixel,x);
|
||||||
dst[start_addr] = sum;
|
dst[start_addr] = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -304,7 +304,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
|
|||||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||||
|
|
||||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||||
sum = sum + (mat_kernel[i * ANCHOR + j] * data);
|
sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -522,7 +522,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
|
|||||||
for(int j = 0; j < ANCHOR; j++)
|
for(int j = 0; j < ANCHOR; j++)
|
||||||
{
|
{
|
||||||
int local_cols = lX + j;
|
int local_cols = lX + j;
|
||||||
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
|
sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,7 +44,11 @@
|
|||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
#if defined (DOUBLE_SUPPORT)
|
||||||
|
#ifdef cl_khr_fp64
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||||
|
#elif defined (cl_amd_fp64)
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#define LSIZE 256
|
#define LSIZE 256
|
||||||
#define LSIZE_1 255
|
#define LSIZE_1 255
|
||||||
@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
|
|||||||
gid = gid << 1;
|
gid = gid << 1;
|
||||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||||
{
|
{
|
||||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
|
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
|
||||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
|
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
|
||||||
|
|
||||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||||
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||||
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||||
@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
|
|||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
||||||
if(lid > 0 && (i+lid) <= rows){
|
if(lid > 0 && (i+lid) <= rows)
|
||||||
|
{
|
||||||
lm_sum[0][bf_loc] += sum_t[0];
|
lm_sum[0][bf_loc] += sum_t[0];
|
||||||
lm_sum[1][bf_loc] += sum_t[1];
|
lm_sum[1][bf_loc] += sum_t[1];
|
||||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||||
@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
|||||||
src_step = src_step >> 4;
|
src_step = src_step >> 4;
|
||||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||||
{
|
{
|
||||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
|
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
|
||||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
|
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
|
||||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
|
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
|
||||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
|
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
|
||||||
|
|
||||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||||
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||||
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||||
@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
|||||||
}
|
}
|
||||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||||
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
|
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
|
||||||
if(lid > 0 && (i+lid) <= rows){
|
if(lid > 0 && (i+lid) <= rows)
|
||||||
|
{
|
||||||
lm_sum[0][bf_loc] += sum_t[0];
|
lm_sum[0][bf_loc] += sum_t[0];
|
||||||
lm_sum[1][bf_loc] += sum_t[1];
|
lm_sum[1][bf_loc] += sum_t[1];
|
||||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||||
|
@ -47,8 +47,12 @@
|
|||||||
//warpAffine kernel
|
//warpAffine kernel
|
||||||
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
||||||
|
|
||||||
#if defined DOUBLE_SUPPORT
|
#if defined (DOUBLE_SUPPORT)
|
||||||
|
#ifdef cl_khr_fp64
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||||
|
#elif defined (cl_amd_fp64)
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||||
|
#endif
|
||||||
typedef double F;
|
typedef double F;
|
||||||
typedef double4 F4;
|
typedef double4 F4;
|
||||||
#define convert_F4 convert_double4
|
#define convert_F4 convert_double4
|
||||||
@ -58,7 +62,6 @@ typedef float4 F4;
|
|||||||
#define convert_F4 convert_float4
|
#define convert_F4 convert_float4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#define INTER_BITS 5
|
#define INTER_BITS 5
|
||||||
#define INTER_TAB_SIZE (1 << INTER_BITS)
|
#define INTER_TAB_SIZE (1 << INTER_BITS)
|
||||||
#define INTER_SCALE 1.f/INTER_TAB_SIZE
|
#define INTER_SCALE 1.f/INTER_TAB_SIZE
|
||||||
@ -123,7 +126,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
|
|||||||
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
||||||
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
||||||
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
||||||
dval = convert_uchar4(dcon != 0) ? sval : dval;
|
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
|
||||||
*d = dval;
|
*d = dval;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,10 +209,10 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
|
|||||||
taby = INTER_SCALE * convert_float4(ay);
|
taby = INTER_SCALE * convert_float4(ay);
|
||||||
tabx = INTER_SCALE * convert_float4(ax);
|
tabx = INTER_SCALE * convert_float4(ax);
|
||||||
|
|
||||||
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
|
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
|
||||||
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
|
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
|
||||||
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
|
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
|
||||||
itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE ));
|
itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
|
||||||
|
|
||||||
|
|
||||||
int4 val;
|
int4 val;
|
||||||
@ -636,7 +639,7 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
|
|||||||
short sy0 = (short)(Y0 >> AB_BITS);
|
short sy0 = (short)(Y0 >> AB_BITS);
|
||||||
|
|
||||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
|
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
|
|||||||
|
|
||||||
float4 v0, v1, v2, v3;
|
float4 v0, v1, v2, v3;
|
||||||
|
|
||||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
|
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
|
||||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
|
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
|
||||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
|
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
|
||||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
|
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
|
||||||
|
|
||||||
float tab[4];
|
float tab[4];
|
||||||
float taby[2], tabx[2];
|
float taby[2], tabx[2];
|
||||||
@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
for(i=0; i<16; i++)
|
for(i=0; i<16; i++)
|
||||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
|
||||||
|
|
||||||
float tab[16];
|
float tab[16];
|
||||||
float tab1y[4], tab1x[4];
|
float tab1y[4], tab1x[4];
|
||||||
|
@ -47,8 +47,12 @@
|
|||||||
//wrapPerspective kernel
|
//wrapPerspective kernel
|
||||||
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
|
||||||
|
|
||||||
#if defined DOUBLE_SUPPORT
|
#if defined (DOUBLE_SUPPORT)
|
||||||
|
#ifdef cl_khr_fp64
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||||
|
#elif defined (cl_amd_fp64)
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||||
|
#endif
|
||||||
typedef double F;
|
typedef double F;
|
||||||
typedef double4 F4;
|
typedef double4 F4;
|
||||||
#define convert_F4 convert_double4
|
#define convert_F4 convert_double4
|
||||||
@ -112,7 +116,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
|
|||||||
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
sval.s1 = scon.s1 ? src[spos.s1] : 0;
|
||||||
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
sval.s2 = scon.s2 ? src[spos.s2] : 0;
|
||||||
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
sval.s3 = scon.s3 ? src[spos.s3] : 0;
|
||||||
dval = convert_uchar4(dcon != 0) ? sval : dval;
|
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
|
||||||
*d = dval;
|
*d = dval;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
|
|||||||
int i;
|
int i;
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
for(i=0; i<4; i++)
|
for(i=0; i<4; i++)
|
||||||
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0;
|
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
|
||||||
|
|
||||||
short itab[4];
|
short itab[4];
|
||||||
float tab1y[2], tab1x[2];
|
float tab1y[2], tab1x[2];
|
||||||
@ -197,7 +201,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
|
|||||||
for(i=0; i<4; i++)
|
for(i=0; i<4; i++)
|
||||||
for(j=0; j<4; j++)
|
for(j=0; j<4; j++)
|
||||||
{
|
{
|
||||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
|
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
|
||||||
}
|
}
|
||||||
|
|
||||||
short itab[16];
|
short itab[16];
|
||||||
@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
|
|||||||
|
|
||||||
int4 v0, v1, v2, v3;
|
int4 v0, v1, v2, v3;
|
||||||
|
|
||||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
|
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
|
||||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0;
|
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
|
||||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0;
|
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
|
||||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0;
|
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
|
||||||
|
|
||||||
int itab0, itab1, itab2, itab3;
|
int itab0, itab1, itab2, itab3;
|
||||||
float taby, tabx;
|
float taby, tabx;
|
||||||
@ -458,10 +462,10 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
|
|||||||
|
|
||||||
float v0, v1, v2, v3;
|
float v0, v1, v2, v3;
|
||||||
|
|
||||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
|
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
|
||||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0;
|
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
|
||||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0;
|
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
|
||||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0;
|
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
|
||||||
|
|
||||||
float tab[4];
|
float tab[4];
|
||||||
float taby[2], tabx[2];
|
float taby[2], tabx[2];
|
||||||
@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
for(i=0; i<16; i++)
|
for(i=0; i<16; i++)
|
||||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
|
||||||
|
|
||||||
float tab[16];
|
float tab[16];
|
||||||
float tab1y[4], tab1x[4];
|
float tab1y[4], tab1x[4];
|
||||||
@ -564,7 +568,7 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
|
|||||||
short sy = (short)Y;
|
short sy = (short)Y;
|
||||||
|
|
||||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
|
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
|
|||||||
|
|
||||||
float4 v0, v1, v2, v3;
|
float4 v0, v1, v2, v3;
|
||||||
|
|
||||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
|
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
|
||||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
|
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
|
||||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
|
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
|
||||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
|
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
|
||||||
|
|
||||||
float tab[4];
|
float tab[4];
|
||||||
float taby[2], tabx[2];
|
float taby[2], tabx[2];
|
||||||
@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
for(i=0; i<16; i++)
|
for(i=0; i<16; i++)
|
||||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
|
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
|
||||||
|
|
||||||
float tab[16];
|
float tab[16];
|
||||||
float tab1y[4], tab1x[4];
|
float tab1y[4], tab1x[4];
|
||||||
|
@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0
|
|||||||
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
|
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
|
||||||
for(j = 0; j < tpl_cols; j ++)
|
for(j = 0; j < tpl_cols; j ++)
|
||||||
{
|
{
|
||||||
sum = mad24(img_ptr[j], tpl_ptr[j], sum);
|
sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res[res_idx] = sum;
|
res[res_idx] = (float)sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0
|
|||||||
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
|
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res[res_idx] = sum.x + sum.y + sum.z + sum.w;
|
res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -633,8 +633,7 @@ void matchTemplate_Prepared_CCOFF_C1_D0
|
|||||||
|
|
||||||
if(gidx < res_cols && gidy < res_rows)
|
if(gidx < res_cols && gidy < res_rows)
|
||||||
{
|
{
|
||||||
float sum = (float)(
|
float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
|
||||||
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
|
|
||||||
-(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
|
-(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
|
||||||
res[res_idx] -= sum * tpl_sum;
|
res[res_idx] -= sum * tpl_sum;
|
||||||
}
|
}
|
||||||
|
@ -53,39 +53,48 @@
|
|||||||
|
|
||||||
//----------------------------------------------------------------------------
|
//----------------------------------------------------------------------------
|
||||||
// Histogram computation
|
// Histogram computation
|
||||||
|
// 12 threads for a cell, 12x4 threads per block
|
||||||
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
|
__kernel void compute_hists_kernel(
|
||||||
|
const int cblock_stride_x, const int cblock_stride_y,
|
||||||
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
||||||
|
const int blocks_in_group, const int blocks_total,
|
||||||
const int grad_quadstep, const int qangle_step,
|
const int grad_quadstep, const int qangle_step,
|
||||||
__global const float* grad, __global const uchar* qangle,
|
__global const float* grad, __global const uchar* qangle,
|
||||||
const float scale, __global float* block_hists, __local float* smem)
|
const float scale, __global float* block_hists, __local float* smem)
|
||||||
{
|
{
|
||||||
const int lidX = get_local_id(0);
|
const int lx = get_local_id(0);
|
||||||
|
const int lp = lx / 24; /* local group id */
|
||||||
|
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
|
||||||
|
const int gidY = gid / img_block_width;
|
||||||
|
const int gidX = gid - gidY * img_block_width;
|
||||||
|
|
||||||
|
const int lidX = lx - lp * 24;
|
||||||
const int lidY = get_local_id(1);
|
const int lidY = get_local_id(1);
|
||||||
const int gidX = get_group_id(0);
|
|
||||||
const int gidY = get_group_id(1);
|
|
||||||
|
|
||||||
const int cell_x = lidX / 16;
|
const int cell_x = lidX / 12;
|
||||||
const int cell_y = lidY;
|
const int cell_y = lidY;
|
||||||
const int cell_thread_x = lidX & 0xF;
|
const int cell_thread_x = lidX - cell_x * 12;
|
||||||
|
|
||||||
__local float* hists = smem;
|
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
|
||||||
__local float* final_hist = smem + cnbins * 48;
|
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
|
||||||
|
__local float* final_hist = hists + cnbins *
|
||||||
|
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
|
||||||
|
|
||||||
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
|
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
|
||||||
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
|
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
|
||||||
|
|
||||||
__global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1);
|
__global const float* grad_ptr = (gid < blocks_total) ?
|
||||||
__global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1);
|
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
|
||||||
|
__global const uchar* qangle_ptr = (gid < blocks_total) ?
|
||||||
|
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
|
||||||
|
|
||||||
// 12 means that 12 pixels affect on block's cell (in one row)
|
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
|
||||||
if (cell_thread_x < 12)
|
cell_thread_x;
|
||||||
{
|
|
||||||
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x;
|
|
||||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
|
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
|
||||||
hist[bin_id * 48] = 0.f;
|
hist[bin_id * 48] = 0.f;
|
||||||
|
|
||||||
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
|
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
|
||||||
|
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
|
||||||
|
|
||||||
const int dist_y_begin = -4 - 4 * lidY;
|
const int dist_y_begin = -4 - 4 * lidY;
|
||||||
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
|
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
|
||||||
@ -97,33 +106,44 @@ __kernel void compute_hists_kernel(const int width, const int cblock_stride_x, c
|
|||||||
qangle_ptr += qangle_step;
|
qangle_ptr += qangle_step;
|
||||||
|
|
||||||
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
|
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
|
||||||
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
|
|
||||||
|
|
||||||
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale);
|
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
|
||||||
float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f;
|
dist_center_x) * scale);
|
||||||
|
float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
|
||||||
|
(8.f - fabs(dist_x + 0.5f)) / 64.f;
|
||||||
|
|
||||||
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
|
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
|
||||||
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
|
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
|
||||||
}
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
volatile __local float* hist_ = hist;
|
volatile __local float* hist_ = hist;
|
||||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
|
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
|
||||||
{
|
{
|
||||||
if (cell_thread_x < 6) hist_[0] += hist_[6];
|
if (cell_thread_x < 6)
|
||||||
if (cell_thread_x < 3) hist_[0] += hist_[3];
|
hist_[0] += hist_[6];
|
||||||
if (cell_thread_x == 0)
|
|
||||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (cell_thread_x < 3)
|
||||||
|
hist_[0] += hist_[3];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
if (cell_thread_x == 0)
|
||||||
|
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
|
||||||
|
hist_[0] + hist_[1] + hist_[2];
|
||||||
|
}
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
__global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size;
|
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
|
||||||
|
if ((tid < cblock_hist_size) && (gid < blocks_total))
|
||||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
|
{
|
||||||
if (tid < cblock_hist_size)
|
__global float* block_hist = block_hists +
|
||||||
|
(gidY * img_block_width + gidX) * cblock_hist_size;
|
||||||
block_hist[tid] = final_hist[tid];
|
block_hist[tid] = final_hist[tid];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//-------------------------------------------------------------
|
//-------------------------------------------------------------
|
||||||
// Normalization of histograms via L2Hys_norm
|
// Normalization of histograms via L2Hys_norm
|
||||||
@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size)
|
|||||||
unsigned int tid = get_local_id(0);
|
unsigned int tid = get_local_id(0);
|
||||||
float sum = smem[tid];
|
float sum = smem[tid];
|
||||||
|
|
||||||
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); }
|
if (size >= 512)
|
||||||
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); }
|
{
|
||||||
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); }
|
if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
if (size >= 256)
|
||||||
|
{
|
||||||
|
if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
if (size >= 128)
|
||||||
|
{
|
||||||
|
if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
{
|
{
|
||||||
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
|
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
|
||||||
|
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if (tid < 16)
|
if (tid < 16)
|
||||||
{
|
{
|
||||||
|
#endif
|
||||||
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
|
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
|
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
|
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
|
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
|
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
|
|||||||
if (tid < 64) products[tid] = product = product + products[tid + 64];
|
if (tid < 64) products[tid] = product = product + products[tid + 64];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
volatile __local float* smem = products;
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
{
|
{
|
||||||
volatile __local float* smem = products;
|
|
||||||
smem[tid] = product = product + smem[tid + 32];
|
smem[tid] = product = product + smem[tid + 32];
|
||||||
|
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if (tid < 16)
|
if (tid < 16)
|
||||||
{
|
{
|
||||||
volatile __local float* smem = products;
|
#endif
|
||||||
smem[tid] = product = product + smem[tid + 16];
|
smem[tid] = product = product + smem[tid + 16];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
smem[tid] = product = product + smem[tid + 8];
|
smem[tid] = product = product + smem[tid + 8];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
smem[tid] = product = product + smem[tid + 4];
|
smem[tid] = product = product + smem[tid + 4];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
smem[tid] = product = product + smem[tid + 2];
|
smem[tid] = product = product + smem[tid + 2];
|
||||||
|
#ifdef WAVE_SIZE_1
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
smem[tid] = product = product + smem[tid + 1];
|
smem[tid] = product = product + smem[tid + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define BUFFER 64
|
#define BUFFER 64
|
||||||
|
|
||||||
|
#ifdef CPU
|
||||||
|
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1;
|
||||||
|
smem2[tid] = val2;
|
||||||
|
smem3[tid] = val3;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#if BUFFER > 128
|
||||||
|
if (tid < 128)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 128];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 128];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 128];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if BUFFER > 64
|
||||||
|
if (tid < 64)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 64];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 64];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 64];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (tid < 32)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 32];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 32];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 32];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 16)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 16];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 16];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 16];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 8];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 8];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 8];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 4];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 4];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 4];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1 += smem1[tid + 2];
|
||||||
|
smem2[tid] = val2 += smem2[tid + 2];
|
||||||
|
smem3[tid] = val3 += smem3[tid + 2];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
smem1[BUFFER] = val1 += smem1[tid + 1];
|
||||||
|
smem2[BUFFER] = val2 += smem2[tid + 1];
|
||||||
|
smem3[BUFFER] = val3 += smem3[tid + 1];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1;
|
||||||
|
smem2[tid] = val2;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#if BUFFER > 128
|
||||||
|
if (tid < 128)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 128]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 128]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if BUFFER > 64
|
||||||
|
if (tid < 64)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 64]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 64]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (tid < 32)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 32]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 32]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 16)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 16]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 16]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 8]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 8]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 4]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 4]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 2]);
|
||||||
|
smem2[tid] = (val2 += smem2[tid + 2]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
smem1[BUFFER] = (val1 += smem1[tid + 1]);
|
||||||
|
smem2[BUFFER] = (val2 += smem2[tid + 1]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void reduce1(float val1, volatile __local float* smem1, int tid)
|
||||||
|
{
|
||||||
|
smem1[tid] = val1;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#if BUFFER > 128
|
||||||
|
if (tid < 128)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 128]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if BUFFER > 64
|
||||||
|
if (tid < 64)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 64]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (tid < 32)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 32]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 16)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 16]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 8]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 4]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
smem1[tid] = (val1 += smem1[tid + 2]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
smem1[BUFFER] = (val1 += smem1[tid + 1]);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
#else
|
||||||
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
|
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
|
||||||
{
|
{
|
||||||
smem1[tid] = val1;
|
smem1[tid] = val1;
|
||||||
@ -325,6 +528,7 @@ void reduce1(float val1, __local float* smem1, int tid)
|
|||||||
vmem1[tid] = val1 += vmem1[tid + 1];
|
vmem1[tid] = val1 += vmem1[tid + 1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SCALE (1.0f / (1 << 20))
|
#define SCALE (1.0f / (1 << 20))
|
||||||
#define THRESHOLD 0.01f
|
#define THRESHOLD 0.01f
|
||||||
@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo
|
|||||||
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
|
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define GRIDSIZE 3
|
||||||
__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
||||||
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
|
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
|
||||||
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
|
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
|
||||||
{
|
{
|
||||||
|
#ifdef CPU
|
||||||
|
__local float smem1[BUFFER+1];
|
||||||
|
__local float smem2[BUFFER+1];
|
||||||
|
__local float smem3[BUFFER+1];
|
||||||
|
#else
|
||||||
__local float smem1[BUFFER];
|
__local float smem1[BUFFER];
|
||||||
__local float smem2[BUFFER];
|
__local float smem2[BUFFER];
|
||||||
__local float smem3[BUFFER];
|
__local float smem3[BUFFER];
|
||||||
|
#endif
|
||||||
|
|
||||||
unsigned int xid=get_local_id(0);
|
unsigned int xid=get_local_id(0);
|
||||||
unsigned int yid=get_local_id(1);
|
unsigned int yid=get_local_id(1);
|
||||||
@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
|||||||
|
|
||||||
const int tid = mad24(yid, xsize, xid);
|
const int tid = mad24(yid, xsize, xid);
|
||||||
|
|
||||||
float2 prevPt = prevPts[gid] / (1 << level);
|
float2 prevPt = prevPts[gid] / (float2)(1 << level);
|
||||||
|
|
||||||
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
|
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
|
||||||
{
|
{
|
||||||
@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
|||||||
float A12 = 0;
|
float A12 = 0;
|
||||||
float A22 = 0;
|
float A22 = 0;
|
||||||
|
|
||||||
float I_patch[3][3];
|
float I_patch[GRIDSIZE][GRIDSIZE];
|
||||||
float dIdx_patch[3][3];
|
float dIdx_patch[GRIDSIZE][GRIDSIZE];
|
||||||
float dIdy_patch[3][3];
|
float dIdy_patch[GRIDSIZE][GRIDSIZE];
|
||||||
|
|
||||||
yBase=yid;
|
yBase=yid;
|
||||||
{
|
{
|
||||||
@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
|||||||
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
|
&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
|
||||||
&A11, &A12, &A22);
|
&A11, &A12, &A22);
|
||||||
}
|
}
|
||||||
|
|
||||||
reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
|
reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#ifdef CPU
|
||||||
|
A11 = smem1[BUFFER];
|
||||||
|
A12 = smem2[BUFFER];
|
||||||
|
A22 = smem3[BUFFER];
|
||||||
|
#else
|
||||||
A11 = smem1[0];
|
A11 = smem1[0];
|
||||||
A12 = smem2[0];
|
A12 = smem2[0];
|
||||||
A22 = smem3[0];
|
A22 = smem3[0];
|
||||||
|
#endif
|
||||||
|
|
||||||
float D = A11 * A22 - A12 * A12;
|
float D = A11 * A22 - A12 * A12;
|
||||||
|
|
||||||
@ -609,8 +826,13 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
|||||||
reduce2(b1, b2, smem1, smem2, tid);
|
reduce2(b1, b2, smem1, smem2, tid);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#ifdef CPU
|
||||||
|
b1 = smem1[BUFFER];
|
||||||
|
b2 = smem2[BUFFER];
|
||||||
|
#else
|
||||||
b1 = smem1[0];
|
b1 = smem1[0];
|
||||||
b2 = smem2[0];
|
b2 = smem2[0];
|
||||||
|
#endif
|
||||||
|
|
||||||
float2 delta;
|
float2 delta;
|
||||||
delta.x = A12 * b2 - A22 * b1;
|
delta.x = A12 * b2 - A22 * b1;
|
||||||
@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
|||||||
nextPts[gid] = prevPt;
|
nextPts[gid] = prevPt;
|
||||||
|
|
||||||
if (calcErr)
|
if (calcErr)
|
||||||
err[gid] = smem1[0] / (c_winSize_x * c_winSize_y);
|
#ifdef CPU
|
||||||
|
err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
|
||||||
|
#else
|
||||||
|
err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
||||||
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
|
__global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
|
||||||
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
|
const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
|
||||||
{
|
{
|
||||||
|
#ifdef CPU
|
||||||
|
__local float smem1[BUFFER+1];
|
||||||
|
__local float smem2[BUFFER+1];
|
||||||
|
__local float smem3[BUFFER+1];
|
||||||
|
#else
|
||||||
__local float smem1[BUFFER];
|
__local float smem1[BUFFER];
|
||||||
__local float smem2[BUFFER];
|
__local float smem2[BUFFER];
|
||||||
__local float smem3[BUFFER];
|
__local float smem3[BUFFER];
|
||||||
|
#endif
|
||||||
|
|
||||||
unsigned int xid=get_local_id(0);
|
unsigned int xid=get_local_id(0);
|
||||||
unsigned int yid=get_local_id(1);
|
unsigned int yid=get_local_id(1);
|
||||||
@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
|||||||
|
|
||||||
const int tid = mad24(yid, xsize, xid);
|
const int tid = mad24(yid, xsize, xid);
|
||||||
|
|
||||||
float2 nextPt = prevPts[gid]/(1<<level);
|
float2 nextPt = prevPts[gid]/(float2)(1<<level);
|
||||||
|
|
||||||
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
|
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
|
||||||
{
|
{
|
||||||
@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
|||||||
|
|
||||||
// extract the patch from the first image, compute covariation matrix of derivatives
|
// extract the patch from the first image, compute covariation matrix of derivatives
|
||||||
|
|
||||||
float A11 = 0;
|
float A11 = 0.0f;
|
||||||
float A12 = 0;
|
float A12 = 0.0f;
|
||||||
float A22 = 0;
|
float A22 = 0.0f;
|
||||||
|
|
||||||
float4 I_patch[8];
|
float4 I_patch[8];
|
||||||
float4 dIdx_patch[8];
|
float4 dIdx_patch[8];
|
||||||
@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
|||||||
reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
|
reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#ifdef CPU
|
||||||
|
A11 = smem1[BUFFER];
|
||||||
|
A12 = smem2[BUFFER];
|
||||||
|
A22 = smem3[BUFFER];
|
||||||
|
#else
|
||||||
A11 = smem1[0];
|
A11 = smem1[0];
|
||||||
A12 = smem2[0];
|
A12 = smem2[0];
|
||||||
A22 = smem3[0];
|
A22 = smem3[0];
|
||||||
|
#endif
|
||||||
|
|
||||||
float D = A11 * A22 - A12 * A12;
|
float D = A11 * A22 - A12 * A12;
|
||||||
|
|
||||||
@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
|||||||
&b1, &b2);
|
&b1, &b2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
reduce2(b1, b2, smem1, smem2, tid);
|
reduce2(b1, b2, smem1, smem2, tid);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
#ifdef CPU
|
||||||
|
b1 = smem1[BUFFER];
|
||||||
|
b2 = smem2[BUFFER];
|
||||||
|
#else
|
||||||
b1 = smem1[0];
|
b1 = smem1[0];
|
||||||
b2 = smem2[0];
|
b2 = smem2[0];
|
||||||
|
#endif
|
||||||
|
|
||||||
float2 delta;
|
float2 delta;
|
||||||
delta.x = A12 * b2 - A22 * b1;
|
delta.x = A12 * b2 - A22 * b1;
|
||||||
@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
|||||||
nextPts[gid] = nextPt;
|
nextPts[gid] = nextPt;
|
||||||
|
|
||||||
if (calcErr)
|
if (calcErr)
|
||||||
err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y);
|
#ifdef CPU
|
||||||
|
err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
|
||||||
|
#else
|
||||||
|
err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -251,9 +251,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
|||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
|
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
|
||||||
|
|
||||||
|
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
||||||
if (X < cwidth - radius && Y < cheight - radius)
|
if (X < cwidth - radius && Y < cheight - radius)
|
||||||
{
|
{
|
||||||
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
|
||||||
if (minSSD.x < minSSDImage[0])
|
if (minSSD.x < minSSDImage[0])
|
||||||
{
|
{
|
||||||
disparImage[0] = (unsigned char)(d + minSSD.y);
|
disparImage[0] = (unsigned char)(d + minSSD.y);
|
||||||
@ -264,7 +264,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
|||||||
for(int row = 1; row < end_row; row++)
|
for(int row = 1; row < end_row; row++)
|
||||||
{
|
{
|
||||||
int idx1 = y_tex * img_step + x_tex;
|
int idx1 = y_tex * img_step + x_tex;
|
||||||
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
|
int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex;
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
@ -278,10 +278,10 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
|||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
||||||
if (X < cwidth - radius && row < cheight - radius - Y)
|
if (X < cwidth - radius && row < cheight - radius - Y)
|
||||||
{
|
{
|
||||||
int idx = row * cminSSD_step;
|
int idx = row * cminSSD_step;
|
||||||
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
|
||||||
if (minSSD.x < minSSDImage[idx])
|
if (minSSD.x < minSSDImage[idx])
|
||||||
{
|
{
|
||||||
disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
|
disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
|
||||||
|
@ -115,10 +115,9 @@ int main(int argc, char **argv)
|
|||||||
std::cout << "platform invalid\n";
|
std::cout << "platform invalid\n";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if(pid != 0 || device != 0)
|
|
||||||
{
|
|
||||||
setDevice(oclinfo[pid], device);
|
setDevice(oclinfo[pid], device);
|
||||||
}
|
|
||||||
cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
|
cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
|
||||||
return RUN_ALL_TESTS();
|
return RUN_ALL_TESTS();
|
||||||
}
|
}
|
||||||
|
@ -1531,6 +1531,10 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
|
|||||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
Values(false)));
|
Values(false)));
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(
|
||||||
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
|
Values(false)));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
|
||||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
Values(false))); // Values(false) is the reserved parameter
|
Values(false))); // Values(false) is the reserved parameter
|
||||||
@ -1586,19 +1590,19 @@ INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32F
|
|||||||
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
|
||||||
Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
||||||
//Values(false) is the reserved parameter
|
//Values(false) is the reserved parameter
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
|
||||||
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
||||||
//Values(false) is the reserved parameter
|
//Values(false) is the reserved parameter
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
|
||||||
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
||||||
//Values(false) is the reserved parameter
|
//Values(false) is the reserved parameter
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
|
||||||
Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
|
||||||
//Values(false) is the reserved parameter
|
//Values(false) is the reserved parameter
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));
|
INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));
|
||||||
|
@ -43,16 +43,14 @@
|
|||||||
#ifdef HAVE_OPENCL
|
#ifdef HAVE_OPENCL
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// BruteForceMatcher
|
// BruteForceMatcher
|
||||||
|
CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist,\
|
||||||
CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist, cv::ocl::BruteForceMatcher_OCL_base::L2Dist, cv::ocl::BruteForceMatcher_OCL_base::HammingDist)
|
cv::ocl::BruteForceMatcher_OCL_base::L2Dist,\
|
||||||
|
cv::ocl::BruteForceMatcher_OCL_base::HammingDist)
|
||||||
IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
|
IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
|
||||||
|
PARAM_TEST_CASE(BruteForceMatcher, DistType, DescriptorSize)
|
||||||
PARAM_TEST_CASE(BruteForceMatcher/*, NormCode*/, DistType, DescriptorSize)
|
|
||||||
{
|
{
|
||||||
//std::vector<cv::ocl::Info> oclinfo;
|
|
||||||
cv::ocl::BruteForceMatcher_OCL_base::DistType distType;
|
cv::ocl::BruteForceMatcher_OCL_base::DistType distType;
|
||||||
int normCode;
|
int normCode;
|
||||||
int dim;
|
int dim;
|
||||||
@ -64,13 +62,9 @@ namespace
|
|||||||
|
|
||||||
virtual void SetUp()
|
virtual void SetUp()
|
||||||
{
|
{
|
||||||
//normCode = GET_PARAM(0);
|
|
||||||
distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0);
|
distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0);
|
||||||
dim = GET_PARAM(1);
|
dim = GET_PARAM(1);
|
||||||
|
|
||||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
|
||||||
//CV_Assert(devnums > 0);
|
|
||||||
|
|
||||||
queryDescCount = 300; // must be even number because we split train data in some cases in two
|
queryDescCount = 300; // must be even number because we split train data in some cases in two
|
||||||
countFactor = 4; // do not change it
|
countFactor = 4; // do not change it
|
||||||
|
|
||||||
@ -172,21 +166,6 @@ namespace
|
|||||||
|
|
||||||
cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
|
cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
|
||||||
|
|
||||||
// assume support atomic.
|
|
||||||
//if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
|
|
||||||
//{
|
|
||||||
// try
|
|
||||||
// {
|
|
||||||
// std::vector< std::vector<cv::DMatch> > matches;
|
|
||||||
// matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
|
|
||||||
// }
|
|
||||||
// catch (const cv::Exception& e)
|
|
||||||
// {
|
|
||||||
// ASSERT_EQ(CV_StsNotImplemented, e.code);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
//else
|
|
||||||
{
|
|
||||||
std::vector< std::vector<cv::DMatch> > matches;
|
std::vector< std::vector<cv::DMatch> > matches;
|
||||||
matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);
|
matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);
|
||||||
|
|
||||||
@ -209,10 +188,9 @@ namespace
|
|||||||
|
|
||||||
ASSERT_EQ(0, badCount);
|
ASSERT_EQ(0, badCount);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
|
INSTANTIATE_TEST_CASE_P(OCL_Features2D, BruteForceMatcher,
|
||||||
//ALL_DEVICES,
|
testing::Combine(
|
||||||
testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)),
|
testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)),
|
||||||
testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304))));
|
testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304))));
|
||||||
|
|
||||||
|
2
modules/python/src2/cv.py
Executable file → Normal file
2
modules/python/src2/cv.py
Executable file → Normal file
@ -1,3 +1 @@
|
|||||||
#/usr/bin/env python
|
|
||||||
|
|
||||||
from cv2.cv import *
|
from cv2.cv import *
|
||||||
|
@ -396,7 +396,7 @@ static PyObject* pyopencv_from(const Mat& m)
|
|||||||
if(!p->refcount || p->allocator != &g_numpyAllocator)
|
if(!p->refcount || p->allocator != &g_numpyAllocator)
|
||||||
{
|
{
|
||||||
temp.allocator = &g_numpyAllocator;
|
temp.allocator = &g_numpyAllocator;
|
||||||
m.copyTo(temp);
|
ERRWRAP2(m.copyTo(temp));
|
||||||
p = &temp;
|
p = &temp;
|
||||||
}
|
}
|
||||||
p->addref();
|
p->addref();
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from string import Template
|
from string import Template
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import hdr_parser, sys, re, os, cStringIO
|
import hdr_parser, sys, re, os, cStringIO
|
||||||
from string import Template
|
from string import Template
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, re, string
|
import os, sys, re, string
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# Calculating and displaying 2D Hue-Saturation histogram of a color image
|
# Calculating and displaying 2D Hue-Saturation histogram of a color image
|
||||||
import sys
|
import sys
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import math
|
import math
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
import math
|
import math
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
import math
|
import math
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import random
|
import random
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import random
|
import random
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import urllib
|
import urllib
|
||||||
import cv2.cv as cv
|
import cv2.cv as cv
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import random
|
import random
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# transformations.py
|
# transformations.py
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import sys, re, os.path
|
import sys, re, os.path
|
||||||
from xml.dom.minidom import parse
|
from xml.dom.minidom import parse
|
||||||
|
@ -1,11 +0,0 @@
|
|||||||
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
|
|
||||||
xmlns:tools="http://schemas.android.com/tools"
|
|
||||||
android:layout_width="match_parent"
|
|
||||||
android:layout_height="match_parent" >
|
|
||||||
|
|
||||||
<org.opencv.android.JavaCameraView
|
|
||||||
android:layout_width="fill_parent"
|
|
||||||
android:layout_height="fill_parent"
|
|
||||||
android:id="@+id/puzzle_activity_surface_view" />
|
|
||||||
|
|
||||||
</LinearLayout>
|
|
@ -1,6 +0,0 @@
|
|||||||
<menu xmlns:android="http://schemas.android.com/apk/res/android">
|
|
||||||
<item android:id="@+id/menu_start_new_game"
|
|
||||||
android:title="@string/menu_start_new_game"
|
|
||||||
android:orderInCategory="100" />
|
|
||||||
<item android:id="@+id/menu_toggle_tile_numbers" android:title="@string/menu_toggle_tile_numbers"></item>
|
|
||||||
</menu>
|
|
@ -6,6 +6,7 @@ import org.opencv.android.OpenCVLoader;
|
|||||||
import org.opencv.core.Mat;
|
import org.opencv.core.Mat;
|
||||||
import org.opencv.android.CameraBridgeViewBase;
|
import org.opencv.android.CameraBridgeViewBase;
|
||||||
import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener;
|
import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener;
|
||||||
|
import org.opencv.android.JavaCameraView;
|
||||||
|
|
||||||
import android.os.Bundle;
|
import android.os.Bundle;
|
||||||
import android.app.Activity;
|
import android.app.Activity;
|
||||||
@ -22,6 +23,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
|
|||||||
|
|
||||||
private CameraBridgeViewBase mOpenCvCameraView;
|
private CameraBridgeViewBase mOpenCvCameraView;
|
||||||
private Puzzle15Processor mPuzzle15;
|
private Puzzle15Processor mPuzzle15;
|
||||||
|
private MenuItem mItemHideNumbers;
|
||||||
|
private MenuItem mItemStartNewGame;
|
||||||
|
|
||||||
|
|
||||||
private int mGameWidth;
|
private int mGameWidth;
|
||||||
private int mGameHeight;
|
private int mGameHeight;
|
||||||
@ -52,9 +56,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
|
|||||||
super.onCreate(savedInstanceState);
|
super.onCreate(savedInstanceState);
|
||||||
getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
|
getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
|
||||||
|
|
||||||
setContentView(R.layout.activity_puzzle15);
|
Log.d(TAG, "Creating and seting view");
|
||||||
|
mOpenCvCameraView = (CameraBridgeViewBase) new JavaCameraView(this, -1);
|
||||||
mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.puzzle_activity_surface_view);
|
setContentView(mOpenCvCameraView);
|
||||||
mOpenCvCameraView.setCvCameraViewListener(this);
|
mOpenCvCameraView.setCvCameraViewListener(this);
|
||||||
mPuzzle15 = new Puzzle15Processor();
|
mPuzzle15 = new Puzzle15Processor();
|
||||||
mPuzzle15.prepareNewGame();
|
mPuzzle15.prepareNewGame();
|
||||||
@ -83,17 +87,19 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener,
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean onCreateOptionsMenu(Menu menu) {
|
public boolean onCreateOptionsMenu(Menu menu) {
|
||||||
getMenuInflater().inflate(R.menu.activity_puzzle15, menu);
|
Log.i(TAG, "called onCreateOptionsMenu");
|
||||||
|
mItemHideNumbers = menu.add("Show/hide tile numbers");
|
||||||
|
mItemStartNewGame = menu.add("Start new game");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean onOptionsItemSelected(MenuItem item) {
|
public boolean onOptionsItemSelected(MenuItem item) {
|
||||||
Log.i(TAG, "Menu Item selected " + item);
|
Log.i(TAG, "Menu Item selected " + item);
|
||||||
if (item.getItemId() == R.id.menu_start_new_game) {
|
if (item == mItemStartNewGame) {
|
||||||
/* We need to start new game */
|
/* We need to start new game */
|
||||||
mPuzzle15.prepareNewGame();
|
mPuzzle15.prepareNewGame();
|
||||||
} else if (item.getItemId() == R.id.menu_toggle_tile_numbers) {
|
} else if (item == mItemHideNumbers) {
|
||||||
/* We need to enable or disable drawing of the tile numbers */
|
/* We need to enable or disable drawing of the tile numbers */
|
||||||
mPuzzle15.toggleTileNumbers();
|
mPuzzle15.toggleTileNumbers();
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@ include ../../sdk/native/jni/OpenCV.mk
|
|||||||
|
|
||||||
LOCAL_MODULE := native_activity
|
LOCAL_MODULE := native_activity
|
||||||
LOCAL_SRC_FILES := native.cpp
|
LOCAL_SRC_FILES := native.cpp
|
||||||
LOCAL_LDLIBS := -lm -llog -landroid
|
LOCAL_LDLIBS += -lm -llog -landroid
|
||||||
LOCAL_STATIC_LIBRARIES := android_native_app_glue
|
LOCAL_STATIC_LIBRARIES := android_native_app_glue
|
||||||
|
|
||||||
include $(BUILD_SHARED_LIBRARY)
|
include $(BUILD_SHARED_LIBRARY)
|
||||||
|
@ -1,2 +1,4 @@
|
|||||||
APP_ABI := armeabi-v7a
|
APP_ABI := armeabi-v7a
|
||||||
|
APP_STL := gnustl_static
|
||||||
|
APP_CPPFLAGS := -frtti -fexceptions
|
||||||
APP_PLATFORM := android-9
|
APP_PLATFORM := android-9
|
||||||
|
@ -73,7 +73,7 @@ int main( int argc, char** argv ) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Mat imgB = imread(argv[2], IMREAD_GRAYSCALE );
|
Mat imgB = imread(argv[2], IMREAD_GRAYSCALE );
|
||||||
if( !imgA.data ) {
|
if( !imgB.data ) {
|
||||||
std::cout << " --(!) Error reading image " << argv[2] << std::endl;
|
std::cout << " --(!) Error reading image " << argv[2] << std::endl;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -12,9 +12,8 @@ static void help()
|
|||||||
{
|
{
|
||||||
// print a welcome message, and the OpenCV version
|
// print a welcome message, and the OpenCV version
|
||||||
cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n"
|
cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n"
|
||||||
"Using OpenCV version %s\n" << CV_VERSION << "\n"
|
"Using OpenCV version " << CV_VERSION << endl;
|
||||||
<< endl;
|
cout << "\nIt uses camera by default, but you can provide a path to video as an argument.\n";
|
||||||
|
|
||||||
cout << "\nHot keys: \n"
|
cout << "\nHot keys: \n"
|
||||||
"\tESC - quit the program\n"
|
"\tESC - quit the program\n"
|
||||||
"\tr - auto-initialize tracking\n"
|
"\tr - auto-initialize tracking\n"
|
||||||
@ -37,6 +36,8 @@ static void onMouse( int event, int x, int y, int /*flags*/, void* /*param*/ )
|
|||||||
|
|
||||||
int main( int argc, char** argv )
|
int main( int argc, char** argv )
|
||||||
{
|
{
|
||||||
|
help();
|
||||||
|
|
||||||
VideoCapture cap;
|
VideoCapture cap;
|
||||||
TermCriteria termcrit(TermCriteria::COUNT|TermCriteria::EPS,20,0.03);
|
TermCriteria termcrit(TermCriteria::COUNT|TermCriteria::EPS,20,0.03);
|
||||||
Size subPixWinSize(10,10), winSize(31,31);
|
Size subPixWinSize(10,10), winSize(31,31);
|
||||||
@ -56,8 +57,6 @@ int main( int argc, char** argv )
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
help();
|
|
||||||
|
|
||||||
namedWindow( "LK Demo", 1 );
|
namedWindow( "LK Demo", 1 );
|
||||||
setMouseCallback( "LK Demo", onMouse, 0 );
|
setMouseCallback( "LK Demo", onMouse, 0 );
|
||||||
|
|
||||||
@ -134,17 +133,16 @@ int main( int argc, char** argv )
|
|||||||
needToInit = true;
|
needToInit = true;
|
||||||
break;
|
break;
|
||||||
case 'c':
|
case 'c':
|
||||||
|
points[0].clear();
|
||||||
points[1].clear();
|
points[1].clear();
|
||||||
break;
|
break;
|
||||||
case 'n':
|
case 'n':
|
||||||
nightMode = !nightMode;
|
nightMode = !nightMode;
|
||||||
break;
|
break;
|
||||||
default:
|
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::swap(points[1], points[0]);
|
std::swap(points[1], points[0]);
|
||||||
swap(prevGray, gray);
|
cv::swap(prevGray, gray);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
'''
|
'''
|
||||||
This module contains some common routines used by other samples.
|
This module contains some common routines used by other samples.
|
||||||
'''
|
'''
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Watershed segmentation
|
Watershed segmentation
|
||||||
|
Loading…
Reference in New Issue
Block a user