diff --git a/android/service/engine/src/org/opencv/engine/BinderConnector.java b/android/service/engine/src/org/opencv/engine/BinderConnector.java index fd23fbfe49..bde54d5b96 100644 --- a/android/service/engine/src/org/opencv/engine/BinderConnector.java +++ b/android/service/engine/src/org/opencv/engine/BinderConnector.java @@ -4,23 +4,43 @@ import android.os.IBinder; public class BinderConnector { - public BinderConnector(MarketConnector Market) - { - Init(Market); - } - public native IBinder Connect(); - public boolean Disconnect() - { - Final(); - return true; + public BinderConnector(MarketConnector Market) { + mMarket = Market; } - static + public boolean Init() { + boolean result = false; + if (mIsReady) + result = Init(mMarket); + + return result; + } + + public native IBinder Connect(); + + public boolean Disconnect() { - System.loadLibrary("OpenCVEngine"); - System.loadLibrary("OpenCVEngine_jni"); + if (mIsReady) + Final(); + + return mIsReady; } private native boolean Init(MarketConnector Market); - public native void Final(); + private native void Final(); + private static boolean mIsReady = false; + private MarketConnector mMarket; + + static { + try { + System.loadLibrary("OpenCVEngine"); + System.loadLibrary("OpenCVEngine_jni"); + mIsReady = true; + } + catch(UnsatisfiedLinkError e) { + mIsReady = false; + e.printStackTrace(); + } + } + } diff --git a/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/android/service/engine/src/org/opencv/engine/HardwareDetector.java index 67320865af..7fc7e1ae8a 100644 --- a/android/service/engine/src/org/opencv/engine/HardwareDetector.java +++ b/android/service/engine/src/org/opencv/engine/HardwareDetector.java @@ -47,9 +47,17 @@ public class HardwareDetector public static native int DetectKnownPlatforms(); - static - { - System.loadLibrary("OpenCVEngine"); - System.loadLibrary("OpenCVEngine_jni"); + public static boolean mIsReady = false; + + static { + try { + System.loadLibrary("OpenCVEngine"); 
+ System.loadLibrary("OpenCVEngine_jni"); + mIsReady = true; + } + catch(UnsatisfiedLinkError e) { + mIsReady = false; + e.printStackTrace(); + } } } diff --git a/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java index df31c7fe8e..b3c4ea0575 100644 --- a/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java +++ b/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java @@ -3,31 +3,62 @@ package org.opencv.engine; import android.app.Service; import android.content.Intent; import android.os.IBinder; +import android.os.RemoteException; import android.util.Log; - public class OpenCVEngineService extends Service { private static final String TAG = "OpenCVEngine/Service"; - private IBinder mEngineInterface; + private IBinder mEngineInterface = null; private MarketConnector mMarket; private BinderConnector mNativeBinder; - public void onCreate() - { + + public void onCreate() { Log.i(TAG, "Service starting"); super.onCreate(); Log.i(TAG, "Engine binder component creating"); mMarket = new MarketConnector(getBaseContext()); mNativeBinder = new BinderConnector(mMarket); - mEngineInterface = mNativeBinder.Connect(); - Log.i(TAG, "Service started successfully"); + if (mNativeBinder.Init()) { + mEngineInterface = mNativeBinder.Connect(); + Log.i(TAG, "Service started successfully"); + } else { + Log.e(TAG, "Cannot initialize native part of OpenCV Manager!"); + Log.e(TAG, "Using stub instead"); + + mEngineInterface = new OpenCVEngineInterface.Stub() { + + @Override + public boolean installVersion(String version) throws RemoteException { + // TODO Auto-generated method stub + return false; + } + + @Override + public String getLibraryList(String version) throws RemoteException { + // TODO Auto-generated method stub + return null; + } + + @Override + public String getLibPathByVersion(String version) throws RemoteException { + // TODO Auto-generated method stub + 
return null; + } + + @Override + public int getEngineVersion() throws RemoteException { + return -1; + } + }; + } } - public IBinder onBind(Intent intent) - { + public IBinder onBind(Intent intent) { Log.i(TAG, "Service onBind called for intent " + intent.toString()); return mEngineInterface; } + public boolean onUnbind(Intent intent) { Log.i(TAG, "Service onUnbind called for intent " + intent.toString()); diff --git a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java index 5213d91495..3c1aac994a 100644 --- a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java @@ -42,6 +42,26 @@ public class ManagerActivity extends Activity @Override public void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); + + if (!HardwareDetector.mIsReady) { + Log.e(TAG, "Cannot initialize native part of OpenCV Manager!"); + + AlertDialog dialog = new AlertDialog.Builder(this).create(); + + dialog.setTitle("OpenCV Manager Error"); + dialog.setMessage("OpenCV Manager is incompatible with this device. 
Please replace it with an appropriate package."); + dialog.setCancelable(false); + dialog.setButton("OK", new DialogInterface.OnClickListener() { + + public void onClick(DialogInterface dialog, int which) { + finish(); + } + }); + + dialog.show(); + return; + } + setContentView(R.layout.main); TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue); @@ -186,6 +206,20 @@ public class ManagerActivity extends Activity } }); + mPackageChangeReciever = new BroadcastReceiver() { + + @Override + public void onReceive(Context context, Intent intent) { + Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever"); + Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message"); + if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) + { + TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); + EngineVersionView.setText("not avaliable"); + } + } + }; + IntentFilter filter = new IntentFilter(); filter.addAction(Intent.ACTION_PACKAGE_ADDED); filter.addAction(Intent.ACTION_PACKAGE_CHANGED); @@ -199,17 +233,23 @@ public class ManagerActivity extends Activity @Override protected void onDestroy() { super.onDestroy(); - unregisterReceiver(mPackageChangeReciever); + if (mPackageChangeReciever != null) + unregisterReceiver(mPackageChangeReciever); } @Override protected void onResume() { super.onResume(); - Log.d(TAG, "Filling package list on resume"); - if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) - { - TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); - EngineVersionView.setText("not avaliable"); + if (HardwareDetector.mIsReady) { + Log.d(TAG, "Filling package list on resume"); + OpenCVEngineServiceConnection connection = new OpenCVEngineServiceConnection(); + if (!bindService(new Intent("org.opencv.engine.BIND"), connection, 
Context.BIND_AUTO_CREATE)) { + Log.e(TAG, "Cannot bind to OpenCV Manager service!"); + TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); + if (EngineVersionView != null) + EngineVersionView.setText("not avaliable"); + unbindService(connection); + } } } @@ -225,19 +265,7 @@ public class ManagerActivity extends Activity protected int ManagerApiLevel = 0; protected String ManagerVersion; - protected BroadcastReceiver mPackageChangeReciever = new BroadcastReceiver() { - - @Override - public void onReceive(Context context, Intent intent) { - Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever"); - Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message"); - if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) - { - TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); - EngineVersionView.setText("not avaliable"); - } - } - }; + protected BroadcastReceiver mPackageChangeReciever = null; protected class OpenCVEngineServiceConnection implements ServiceConnection { @@ -246,6 +274,12 @@ public class ManagerActivity extends Activity public void onServiceConnected(ComponentName name, IBinder service) { OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service); + if (EngineService == null) { + Log.e(TAG, "Cannot connect to OpenCV Manager Service!"); + unbindService(this); + return; + } + try { ManagerApiLevel = EngineService.getEngineVersion(); } catch (RemoteException e) { diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index 9ee23da55b..9b841dad8a 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -27,23 +27,23 @@ endif() # the -fPIC flag should be used. 
# ---------------------------------------------------------------------------- if(UNIX) - if (__ICL) - set(CV_ICC __ICL) - elseif(__ICC) - set(CV_ICC __ICC) - elseif(__ECL) - set(CV_ICC __ECL) - elseif(__ECC) - set(CV_ICC __ECC) - elseif(__INTEL_COMPILER) - set(CV_ICC __INTEL_COMPILER) - elseif(CMAKE_C_COMPILER MATCHES "icc") - set(CV_ICC icc_matches_c_compiler) - endif() + if (__ICL) + set(CV_ICC __ICL) + elseif(__ICC) + set(CV_ICC __ICC) + elseif(__ECL) + set(CV_ICC __ECL) + elseif(__ECC) + set(CV_ICC __ECC) + elseif(__INTEL_COMPILER) + set(CV_ICC __INTEL_COMPILER) + elseif(CMAKE_C_COMPILER MATCHES "icc") + set(CV_ICC icc_matches_c_compiler) + endif() endif() if(MSVC AND CMAKE_C_COMPILER MATCHES "icc") - set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) + set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) endif() # ---------------------------------------------------------------------------- @@ -64,45 +64,49 @@ if(CMAKE_COMPILER_IS_CLANGCXX) string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_CLANG_REGEX_VERSION "${CMAKE_OPENCV_CLANG_VERSION_FULL}") elseif(CMAKE_COMPILER_IS_GNUCXX) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion - OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion + OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL + OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v - ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v + ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL + OUTPUT_STRIP_TRAILING_WHITESPACE) - # Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)" - # Look for the version number - string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") - if(NOT CMAKE_GCC_REGEX_VERSION) - 
string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") - endif() - - # Split the three parts: - string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}") - - list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR) - list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR) - - set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR}) - math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}") - message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})") - - if(WIN32) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine - OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") - set(MINGW64 1) - endif() + # Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)" + # Look for the version number + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") + if(NOT CMAKE_GCC_REGEX_VERSION) + string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") + endif() + + # Split the three parts: + string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}") + + list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR) + list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR) + + set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR}) + math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}") + message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})") + + if(WIN32) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine + 
OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + set(MINGW64 1) endif() + endif() endif() -if(MINGW64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64") - set(X86_64 1) +if(MSVC64 OR MINGW64) + set(X86_64 1) +elseif(MSVC AND NOT CMAKE_CROSSCOMPILING) + set(X86 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(X86_64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") - set(X86 1) + set(X86 1) elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*") - set(ARM 1) + set(ARM 1) endif() diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 069f95981c..92655228d1 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -4,7 +4,7 @@ if(APPLE) set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) else(APPLE) - find_package(OpenCL QUIET) + #find_package(OpenCL QUIET) if (NOT OPENCL_FOUND) find_path(OPENCL_ROOT_DIR diff --git a/doc/check_docs.py b/doc/check_docs.py index 487b390e8a..c18bf07261 100755 --- a/doc/check_docs.py +++ b/doc/check_docs.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, glob diff --git a/doc/check_docs2.py b/doc/check_docs2.py index 6446edb296..60c6d7bdfb 100755 --- a/doc/check_docs2.py +++ b/doc/check_docs2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, fnmatch, re diff --git a/doc/conf.py b/doc/conf.py index 7b9b02ecf4..4c7a15c891 100755 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- # diff --git a/doc/ocv.py b/doc/ocv.py index 4ff8a6deb5..8fcef4040b 100755 --- a/doc/ocv.py +++ b/doc/ocv.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- """ ocv 
domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python. diff --git a/doc/patch_refman_latex.py b/doc/patch_refman_latex.py index 352c46cb56..ff762fc8f3 100755 --- a/doc/patch_refman_latex.py +++ b/doc/patch_refman_latex.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py index 45b45af2db..3643b6d3b2 100755 --- a/doc/pattern_tools/gen_pattern.py +++ b/doc/pattern_tools/gen_pattern.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python """gen_pattern.py To run: diff --git a/doc/pattern_tools/svgfig.py b/doc/pattern_tools/svgfig.py index bf182a8b09..86afa59133 100755 --- a/doc/pattern_tools/svgfig.py +++ b/doc/pattern_tools/svgfig.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # svgfig.py copyright (C) 2008 Jim Pivarski # diff --git a/doc/reformat.py b/doc/reformat.py index 00e4aae9e0..017efebb38 100755 --- a/doc/reformat.py +++ b/doc/reformat.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re diff --git a/modules/contrib/doc/facerec/src/create_csv.py b/modules/contrib/doc/facerec/src/create_csv.py index 71d773c017..c4de778f98 100755 --- a/modules/contrib/doc/facerec/src/create_csv.py +++ b/modules/contrib/doc/facerec/src/create_csv.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import os.path diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index 55683ec179..602dcd1e14 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -455,7 +455,7 @@ protected: TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); } */ -TEST(Core_globbing, accurasy) +TEST(Core_globbing, accuracy) { std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*"; std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png"; diff --git a/modules/gpu/misc/mark_nvidia.py 
b/modules/gpu/misc/mark_nvidia.py index e8cc3e8417..08743fb136 100755 --- a/modules/gpu/misc/mark_nvidia.py +++ b/modules/gpu/misc/mark_nvidia.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, re diff --git a/modules/highgui/src/cap_v4l.cpp b/modules/highgui/src/cap_v4l.cpp index 829d0ab638..6fc122fd0c 100644 --- a/modules/highgui/src/cap_v4l.cpp +++ b/modules/highgui/src/cap_v4l.cpp @@ -154,6 +154,11 @@ the symptoms were damaged image and 'Corrupt JPEG data: premature end of data se - USE_TEMP_BUFFER fixes the main problem (improper buffer management) and prevents bad images in the first place +11th patch: April 2, 2013, Forrest Reiling forrest.reiling@gmail.com +Added v4l2 support for getting capture property CV_CAP_PROP_POS_MSEC. +Returns the millisecond timestamp of the last frame grabbed or 0 if no frames have been grabbed +Used to successfully synchronize 2 Logitech C310 USB webcams to within 16 ms of one another + make & enjoy! @@ -320,6 +325,8 @@ typedef struct CvCaptureCAM_V4L struct v4l2_queryctrl queryctrl; struct v4l2_querymenu querymenu; + struct timeval timestamp; + /* V4L2 control variables */ int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max; int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max; @@ -836,6 +843,9 @@ static int _capture_V4L2 (CvCaptureCAM_V4L *capture, char *deviceName) capture->v4l2_gain_max = 0; capture->v4l2_exposure_max = 0; + capture->timestamp.tv_sec = 0; + capture->timestamp.tv_usec = 0; + /* Scan V4L2 controls */ v4l2_scan_controls(capture); @@ -1221,6 +1231,9 @@ static int read_frame_v4l2(CvCaptureCAM_V4L* capture) { if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf)) perror ("VIDIOC_QBUF"); + //set timestamp in capture struct to be timestamp of most recent frame + capture->timestamp = buf.timestamp; + return 1; } @@ -2308,6 +2321,13 @@ static double icvGetPropertyCAM_V4L (CvCaptureCAM_V4L* capture, /* initialize the control structure */ switch (property_id) { + case 
CV_CAP_PROP_POS_MSEC: + if (capture->FirstCapture) { + return 0; + } else { + return 1000 * capture->timestamp.tv_sec + ((double) capture->timestamp.tv_usec) / 1000; + } + break; case CV_CAP_PROP_BRIGHTNESS: capture->control.id = V4L2_CID_BRIGHTNESS; break; diff --git a/modules/java/check-tests.py b/modules/java/check-tests.py index 4cb80ff724..c4d34f61e9 100755 --- a/modules/java/check-tests.py +++ b/modules/java/check-tests.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, os, re diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index e48b682437..4b2b9c4a8a 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, re, os.path from string import Template diff --git a/modules/java/generator/gen_javadoc.py b/modules/java/generator/gen_javadoc.py index 71372d3a2c..dfa591a959 100755 --- a/modules/java/generator/gen_javadoc.py +++ b/modules/java/generator/gen_javadoc.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string, glob from optparse import OptionParser diff --git a/modules/java/generator/rst_parser.py b/modules/java/generator/rst_parser.py index a6ee3f0b32..ad8358542c 100755 --- a/modules/java/generator/rst_parser.py +++ b/modules/java/generator/rst_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string, fnmatch allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"] diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 3b7e7a9bbb..508fb3f127 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -76,7 +76,7 @@ namespace cv size_t wave_size = 0; 
queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); - std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast(wave_size)); + std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast(wave_size)); OPTION_INIT = true; } openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 982084b7e0..9ac032a29a 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -127,8 +127,9 @@ namespace cv // currently only support wavefront size queries enum DEVICE_INFO { - WAVEFRONT_SIZE, //in AMD speak - WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak + WAVEFRONT_SIZE, //in AMD speak + WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak + IS_CPU_DEVICE //check if the device is CPU }; //info should have been pre-allocated void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info); diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 82208f2524..6e57bb884e 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -91,9 +91,6 @@ namespace cv extern const char *arithm_bitwise_xor_scalar_mask; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; - extern const char *arithm_sub; - extern const char *arithm_sub_scalar; - extern const char *arithm_sub_scalar_mask; extern const char *arithm_mul; extern const char *arithm_div; extern const char *arithm_absdiff; @@ -260,11 +257,11 @@ void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const ocl void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst) { - arithmetic_run(src1, src2, dst, "arithm_sub", &arithm_sub); + arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add); } void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - arithmetic_run(src1, src2, dst, mask, 
"arithm_sub_with_mask", &arithm_sub); + arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add); } typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, void *scalar); @@ -451,14 +448,16 @@ void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const ocl void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; - const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; + String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; + const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; + arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1); } void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask) { - String kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; - const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; + String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; + const char **kernelString = mask.data ? 
&arithm_add_scalar_mask : &arithm_add_scalar; + arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); } void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 432dac686f..37a69cece5 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -394,6 +394,15 @@ namespace cv } break; + case IS_CPU_DEVICE: + { + cl_device_type devicetype; + openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], + CL_DEVICE_TYPE, sizeof(cl_device_type), + &devicetype, NULL)); + *(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU); + } + break; default: CV_Error(-1, "Invalid device info type"); break; diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 7177050581..7cd596f333 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -393,7 +393,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be if( rtype < 0 ) rtype = type(); else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels()); //int scn = channels(); int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype); diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 647171578d..7d4b0a7653 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -52,6 +52,11 @@ #endif #endif +#ifdef ARITHM_ADD + #define ARITHM_OP(A,B) ((A)+(B)) +#elif defined ARITHM_SUB + #define ARITHM_OP(A,B) ((A)-(B)) +#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////ADD//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -95,7 +100,7 @@ __kernel void 
arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -134,7 +139,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); ushort4 tmp_data = convert_ushort4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -172,7 +177,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); short4 tmp_data = convert_short4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : dst_data.x; @@ -200,7 +205,7 @@ __kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset, int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = (long)(data1) + (long)(data2); + long tmp = ARITHM_OP((long)(data1), (long)(data2)); *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); } @@ -221,7 +226,7 @@ __kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offse float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = data1 + data2; + float tmp = ARITHM_OP(data1, data2); *((__global float *)((__global char *)dst + dst_index)) = tmp; } @@ -245,7 +250,7 @@ __kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offs double data1 = *((__global double *)((__global char *)src1 + src1_index)); double data2 = *((__global double *)((__global char *)src2 + src2_index)); - *((__global double *)((__global char *)dst + dst_index)) = data1 + data2; + *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2); } } #endif @@ -302,7 +307,7 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i } uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : data.x; @@ -344,7 +349,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, uchar2 mask_data = vload2(0, mask + mask_index); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); ushort2 tmp_data = convert_ushort2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; @@ -384,7 +389,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i uchar2 mask_data = vload2(0, mask + mask_index); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); short2 tmp_data = convert_short2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; @@ -416,7 +421,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = convert_int_sat((long)src_data1 + (long)src_data2); + int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); data = mask_data ? data : dst_data; *((__global int *)((__global char *)dst + dst_index)) = data; @@ -446,7 +451,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); float dst_data = *((__global float *)((__global char *)dst + dst_index)); - float data = src_data1 + src_data2; + float data = ARITHM_OP(src_data1, src_data2); data = mask_data ? 
data : dst_data; *((__global float *)((__global char *)dst + dst_index)) = data; @@ -477,7 +482,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); double dst_data = *((__global double *)((__global char *)dst + dst_index)); - double data = src_data1 + src_data2; + double data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global double *)((__global char *)dst + dst_index)) = data; @@ -516,7 +521,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i uchar2 mask_data = vload2(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; @@ -548,7 +553,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); ushort2 data = convert_ushort2_sat(tmp); data = mask_data ? data : dst_data; @@ -578,7 +583,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); short2 data = convert_short2_sat(tmp); data = mask_data ? 
data : dst_data; @@ -608,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2)); + int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); data = mask_data ? data : dst_data; *((__global int2 *)((__global char *)dst + dst_index)) = data; @@ -637,7 +642,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - float2 data = src_data1 + src_data2; + float2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float2 *)((__global char *)dst + dst_index)) = data; @@ -668,329 +673,14 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - double2 data = src_data1 + src_data2; + double2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? 
data : dst_data; *((__global double2 *)((__global char *)dst + dst_index)) = data; } } #endif -__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= 
dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, 
(__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global 
short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + 
src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + 
src2_index + 0)); - float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 
)); - double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1014,7 +704,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2)); + uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2))); data = mask_data ? 
data : dst_data; *((__global uchar4 *)(dst + dst_index)) = data; @@ -1043,7 +733,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); + ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); data = mask_data ? data : dst_data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; @@ -1072,7 +762,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); + short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); data = mask_data ? data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; @@ -1101,7 +791,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2)); + int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2))); data = mask_data ? 
data : dst_data; *((__global int4 *)((__global char *)dst + dst_index)) = data; @@ -1130,7 +820,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - float4 data = src_data1 + src_data2; + float4 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float4 *)((__global char *)dst + dst_index)) = data; @@ -1161,7 +851,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - double4 data = src_data1 + src_data2; + double4 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global double4 *)((__global char *)dst + dst_index)) = data; diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl index 944442b0f1..49242d07c7 100644 --- a/modules/ocl/src/opencl/arithm_flip.cl +++ b/modules/ocl/src/opencl/arithm_flip.cl @@ -330,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src if (x < thread_cols && y < rows) { int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); - uchar data0 = *(src + src_index_0); - uchar data1 = *(src + src_index_1); - - *(dst + dst_index_0) = data1; *(dst + dst_index_1) = data0; + + int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); + int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); + uchar data1 = *(src + src_index_1); + *(dst + dst_index_0) = data1; } } __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int 
src_offset, diff --git a/modules/ocl/src/opencl/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl deleted file mode 100644 index 9cf37970b2..0000000000 --- a/modules/ocl/src/opencl/arithm_sub.cl +++ /dev/null @@ -1,1104 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////SUB//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************sub without mask**************************************/ -__kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - 
uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -__kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data); - ushort4 tmp_data = convert_ushort4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? 
tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_sub_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = (long)(data1) - (long)(data2); - - *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); - } -} -__kernel void arithm_sub_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = data1 - data2; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols 
&& y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - *((__global double *)((__global char *)dst + dst_index)) = data1 - data2; - } -} -#endif - -/**************************************sub with mask**************************************/ -__kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? 
tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat((long)src_data1 - (long)src_data2); - data = mask_data ? 
data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2); - short2 data = convert_short2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) - convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) - convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? 
tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - 
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index 
+ 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global 
int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 - (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 - (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 - (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0)); - float src2_data_1 = *((__global float 
*)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 - src2_data_0; - float tmp_data_1 = src1_data_1 - src2_data_1; - float tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 )); - double src2_data_1 = *((__global double *)((__global 
char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif -__kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = src_data1 - src_data2; - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl deleted file mode 100644 index 782bcd0607..0000000000 --- a/modules/ocl/src/opencl/arithm_sub_scalar.cl +++ /dev/null @@ -1,806 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -/**************************************sub with scalar without mask**************************************/ -__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - short2 tmp_data = convert_short2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - - long tmp = (long)src_data1 - (long)src_data2; - tmp = isMatSubScalar ? 
tmp : -tmp; - int data = convert_int_sat(tmp); - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - - float tmp = src_data1 - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src2_data = src2.x; - - double data = src_data1 - src2_data; - data = isMatSubScalar ? 
data : -data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? 
tmp : -tmp; - ushort2 data = convert_ushort2_sat(tmp); - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - short2 data = convert_short2_sat(tmp); - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2); - tmp = isMatSubScalar ? 
tmp : -tmp; - int2 data = convert_int2_sat(tmp); - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 tmp = src_data1 - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float2 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = isMatSubScalar ? 
data : -data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0; - int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1; - int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0); - uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1); - uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) 
- src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0); - ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1); - ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = 
(int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - short2 tmp_data_0 = convert_short2_sat(tmp_0); - short2 tmp_data_1 = convert_short2_sat(tmp_1); - short2 tmp_data_2 = convert_short2_sat(tmp_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - long tmp_0 = (long)src1_data_0 - (long)src2_data_0; - long tmp_1 = (long)src1_data_1 - (long)src2_data_1; - long tmp_2 = (long)src1_data_2 - (long)src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? 
tmp_2 : -tmp_2; - - int tmp_data_0 = convert_int_sat(tmp_0); - int tmp_data_1 = convert_int_sat(tmp_1); - int tmp_data_2 = convert_int_sat(tmp_2); - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_0 = src1_data_0 - src2_data_0; - float tmp_1 = src1_data_1 - src2_data_1; - float tmp_2 = src1_data_2 - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? 
tmp_2 : -tmp_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? 
tmp_data_2 : -tmp_data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif -__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 data = convert_uchar4_sat(tmp); - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? 
tmp : -tmp; - ushort4 data = convert_ushort4_sat(tmp); - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - short4 data = convert_short4_sat(tmp); - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2); - tmp = isMatSubScalar ? 
tmp : -tmp; - int4 data = convert_int4_sat(tmp); - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - - float4 tmp = src_data1 - src2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float4 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - - double4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl deleted file mode 100644 index 135354993c..0000000000 --- a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl +++ /dev/null @@ -1,941 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -/**************************************sub with scalar with mask**************************************/ -__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - long tmp = (long)src_data1 - (long)src_data2; - tmp = isMatSubScalar ? tmp : - tmp; - int data = convert_int_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? 
data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - short2 data = convert_short2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2); - tmp = isMatSubScalar ? tmp : -tmp; - int2 data = convert_int2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = 
isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0; - int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1; - int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? 
tmp_2 : -tmp_2; - - uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0); - uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1); - uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global 
char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0); - ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1); - ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? 
tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - short2 tmp_data_0 = convert_short2_sat(tmp_0); - short2 tmp_data_1 = convert_short2_sat(tmp_1); - short2 tmp_data_2 = convert_short2_sat(tmp_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + 
dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - long tmp_0 = (long)src1_data_0 - (long)src2_data_0; - long tmp_1 = (long)src1_data_1 - (long)src2_data_1; - long tmp_2 = (long)src1_data_2 - (long)src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - int tmp_data_0 = convert_int_sat(tmp_0); - int tmp_data_1 = convert_int_sat(tmp_1); - int tmp_data_2 = convert_int_sat(tmp_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = 
*((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 - src2_data_0; - float tmp_data_1 = src1_data_1 - src2_data_1; - float tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double 
tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif -__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 data = convert_uchar4_sat(tmp); - - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - ushort4 data = convert_ushort4_sat(tmp); - - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - short4 data = convert_short4_sat(tmp); - - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2); - tmp = isMatSubScalar ? tmp : -tmp; - int4 data = convert_int4_sat(tmp); - - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index e76fb1d21e..fc7e5159fd 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -1,5 +1,58 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// @Authors +// Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#define MAX_FLOAT 1e7f +#define MAX_FLOAT 3.40282e+038f + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 16 +#endif +#ifndef MAX_DESC_LEN +#define MAX_DESC_LEN 64 +#endif int bit1Count(float x) { @@ -13,83 +66,52 @@ int bit1Count(float x) return (float)c; } +#ifndef DIST_TYPE +#define DIST_TYPE 0 +#endif + +#if (DIST_TYPE == 0) +#define DIST(x, y) fabs((x) - (y)) +#elif (DIST_TYPE == 1) +#define DIST(x, y) (((x) - (y)) * ((x) - (y))) +#elif (DIST_TYPE == 2) +#define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y)) +#endif + + float reduce_block(__local float *s_query, __local float *s_train, - int block_size, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]); } return result; } float reduce_multi_block(__local float *s_query, __local float *s_train, - int max_desc_len, - int block_size, int block_index, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. 
the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < BLOCK_SIZE ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]); } return result; } -/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size -local size: dim0 is block_size, dim1 is block_size. +/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE +local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE. 
*/ __kernel void BruteForceMatch_UnrollMatch_D5( __global float *query, @@ -98,29 +120,28 @@ __kernel void BruteForceMatch_UnrollMatch_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { + const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * max_desc_len; + __local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN; - int queryIdx = groupidx * block_size + lidy; + int queryIdx = groupidx * BLOCK_SIZE + lidy; // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) + #pragma unroll + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++) { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + int loadx = lidx + i * BLOCK_SIZE; + s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; } float myBestDistance = MAX_FLOAT; @@ -128,24 +149,25 @@ __kernel void BruteForceMatch_UnrollMatch_D5( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) + #pragma unroll + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? 
train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } - int trainIdx = t * block_size + lidx; + int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) { @@ -157,18 +179,19 @@ __kernel void BruteForceMatch_UnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float*)(sharebuffer); - __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance; s_trainIdx[lidx] = myBestTrainIdx; barrier(CLK_LOCAL_MEM_FENCE); //reduce -- now all reduce implement in each threads. 
- for (int k = 0 ; k < block_size; k++) + #pragma unroll + for (int k = 0 ; k < BLOCK_SIZE; k++) { if (myBestDistance > s_distance[k]) { @@ -191,53 +214,51 @@ __kernel void BruteForceMatch_Match_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; float myBestDistance = MAX_FLOAT; int myBestTrainIdx = -1; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; // loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { //Dist dist; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++) { - const int loadx = lidx + i * block_size; + const int loadx = lidx + i * BLOCK_SIZE; //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; + s_query[lidy * BLOCK_SIZE + lidx] = 0; + s_train[lidx * BLOCK_SIZE + lidy] = 0; if (loadx < query_cols) { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; } barrier(CLK_LOCAL_MEM_FENCE); - 
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) { @@ -250,18 +271,18 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance; s_trainIdx[lidx] = myBestTrainIdx; barrier(CLK_LOCAL_MEM_FENCE); //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) + for (int k = 0 ; k < BLOCK_SIZE; k++) { if (myBestDistance > s_distance[k]) { @@ -287,16 +308,13 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -304,25 +322,25 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( const int groupidx = get_group_id(0); const int groupidy = get_group_id(1); - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + 
BLOCK_SIZE * BLOCK_SIZE; float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; ++i) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -350,15 +368,13 @@ __kernel void BruteForceMatch_RadiusMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -366,25 +382,25 @@ __kernel void BruteForceMatch_RadiusMatch_D5( const int groupidx = get_group_id(0); const int groupidy = get_group_id(1); - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? 
train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -410,29 +426,26 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * max_desc_len; + local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN; // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++) { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + int loadx = lidx + i * BLOCK_SIZE; + s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? 
query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; } float myBestDistance1 = MAX_FLOAT; @@ -442,25 +455,25 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //loopUnrolledCached volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++) { - const int loadX = lidx + i * block_size; - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + const int loadX = lidx + i * BLOCK_SIZE; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows) { @@ -482,11 +495,11 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); local float *s_distance = (local float *)sharebuffer; - local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); + local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); // find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance1; s_trainIdx[lidx] = myBestTrainIdx1; @@ -499,7 +512,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; if (val < bestDistance1) @@ -527,7 +540,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; @@ -559,22 +572,20 @@ __kernel void BruteForceMatch_knnMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * block_size; + local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; float 
myBestDistance1 = MAX_FLOAT; float myBestDistance2 = MAX_FLOAT; @@ -582,30 +593,30 @@ __kernel void BruteForceMatch_knnMatch_D5( int myBestTrainIdx2 = -1; //loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { float result = 0.0f; - for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++) { - const int loadx = lidx + i * block_size; + const int loadx = lidx + i * BLOCK_SIZE; //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; + s_query[lidy * BLOCK_SIZE + lidx] = 0; + s_train[lidx * BLOCK_SIZE + lidy] = 0; if (loadx < query_cols) { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; } barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) { @@ -627,11 +638,11 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //findBestMatch - s_distance += lidy * 
block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance1; s_trainIdx[lidx] = myBestTrainIdx1; @@ -644,7 +655,7 @@ __kernel void BruteForceMatch_knnMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; if (val < bestDistance1) @@ -672,7 +683,7 @@ __kernel void BruteForceMatch_knnMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; @@ -703,14 +714,11 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } @@ -721,13 +729,11 @@ kernel void BruteForceMatch_calcDistance_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } @@ -736,8 +742,7 @@ kernel void BruteForceMatch_findBestMatch_D5( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, - int k, - int block_size + int k ) { /* Todo */ diff --git a/modules/ocl/src/opencl/filter_sep_row.cl b/modules/ocl/src/opencl/filter_sep_row.cl index bfe6cd4dd6..9dc498399d 100644 --- a/modules/ocl/src/opencl/filter_sep_row.cl +++ b/modules/ocl/src/opencl/filter_sep_row.cl @@ -96,18 +96,18 @@ The info above maybe obsolete. 
***********************************************************************************/ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 - (__global const uchar * restrict src, - __global float * dst, - const int dst_cols, - const int dst_rows, - const int src_whole_cols, - const int src_whole_rows, - const int src_step_in_pixel, - const int src_offset_x, - const int src_offset_y, - const int dst_step_in_pixel, - const int radiusy, - __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) +(__global const uchar * restrict src, + __global float * dst, + const int dst_cols, + const int dst_rows, + const int src_whole_cols, + const int src_whole_rows, + const int src_step_in_pixel, + const int src_offset_x, + const int src_offset_y, + const int dst_step_in_pixel, + const int radiusy, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) { int x = get_global_id(0)<<2; int y = get_global_id(1); @@ -122,17 +122,17 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); //read pixels from src - for(i = 0;i 0)) ? current_addr : 0; temp[i] = *(__global uchar4*)&src[current_addr]; } //judge if read out of boundary - for(i = 0;isrc_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); int4 index[READ_TIMES_ROW]; int4 addr; @@ -148,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ if(not_all_in_range) { //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? 
current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); + } } } @@ -207,7 +207,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum); } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////32FC1//////////////////////////////////////////////////////// @@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; @@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = con ? 
data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -295,17 +295,17 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * data); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data); + } } } @@ -318,7 +318,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum; } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + 
(gY << ROWS_PER_GROUP_BITS) - ANY; @@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -410,17 +410,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); - } + data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); + } } } @@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef 
BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; } - #endif +#endif } } } @@ -519,10 +519,10 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - for(int j = 0; j < ANCHOR; j++) - { - int local_cols = lX + j; - sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]; + for(int j = 0; j < ANCHOR; j++) + { + int local_cols = lX + j; + sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]); } } diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl index 80f460b86e..c546957687 100644 --- a/modules/ocl/src/opencl/imgproc_integral.cl +++ b/modules/ocl/src/opencl/imgproc_integral.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif #define LSIZE 256 #define LSIZE_1 255 @@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float gid = gid << 1; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); - src_t[1] = (i + lid < rows ? 
convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); + src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float } barrier(CLK_LOCAL_MEM_FENCE); int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; @@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo src_step = src_step >> 4; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; - sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; - sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0; + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; sum_t[0] = (i == 0 ? 
0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo barrier(CLK_LOCAL_MEM_FENCE); if(gid == 0 && (i + lid) <= rows) { - sum[sum_offset + i + lid] = 0; - sqsum[sqsum_offset + i + lid] = 0; + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; } if(i + lid == 0) { int loc0 = gid * 2 * sum_step; int loc1 = gid * 2 * sqsum_step; - for(int k = 1;k <= 8;k++) + for(int k = 1; k <= 8; k++) { if(gid * 8 + k > cols) break; sum[sum_offset + loc0 + k * sum_step / 4] = 0; @@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo } int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl index 8aee1838c4..16971e252b 100644 --- a/modules/ocl/src/opencl/imgproc_warpAffine.cl +++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl @@ -47,8 +47,12 @@ //warpAffine kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. 
-#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -58,7 +62,6 @@ typedef float4 F4; #define convert_F4 convert_float4 #endif - #define INTER_BITS 5 #define INTER_TAB_SIZE (1 << INTER_BITS) #define INTER_SCALE 1.f/INTER_TAB_SIZE @@ -81,8 +84,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -123,14 +126,14 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? 
sval : dval; *d = dval; } } __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -206,16 +209,16 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob taby = INTER_SCALE * convert_float4(ay); tabx = INTER_SCALE * convert_float4(ax); - itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE )); + itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); + itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); int4 val; uchar4 tval; val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) - + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); + + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); @@ -228,8 +231,8 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob } __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int 
dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -255,10 +258,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -288,7 +291,7 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? 
(itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -309,8 +312,8 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -333,8 +336,8 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global } __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -386,8 +389,8 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo } __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -418,10 +421,10 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j 
>= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -447,14 +450,14 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -477,8 +480,8 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob ***********************************************************************************************/ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -501,8 +504,8 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int } __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = 
get_global_id(1); @@ -548,12 +551,12 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>2)+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -617,8 +620,8 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -636,13 +639,13 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i short sy0 = (short)(Y0 >> AB_BITS); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : (float4)0; } } __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, 
int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -691,12 +694,12 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[dst_offset+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -754,5 +757,5 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl index a37ffa1bee..ef9e77058c 100644 --- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -47,8 +47,12 @@ //wrapPerspective kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. 
-#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -81,8 +85,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -112,14 +116,14 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? 
sval : dval; *d = dval; } } __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ int i; #pragma unroll 4 for(i=0; i<4; i++) - v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; + v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; short itab[4]; float tab1y[2], tab1x[2]; @@ -170,8 +174,8 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ } __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -190,15 +194,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); - uchar v[16]; + uchar v[16]; int i, j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -227,7 +231,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -249,8 +253,8 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -273,8 +277,8 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl } __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, int4 v0, v1, v2, v3; - v0 = (sx >= 0 
&& sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; int itab0, itab1, itab2, itab3; float taby, tabx; @@ -323,8 +327,8 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, } __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -352,10 +356,10 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
(src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -381,14 +385,14 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -411,8 +415,8 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -434,8 +438,8 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst } __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -458,10 +462,10 @@ __kernel void 
warpPerspectiveLinear_C1_D5(__global float * src, __global float * float v0, v1, v2, v3; - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; float tab[4]; float taby[2], tabx[2]; @@ -483,8 +487,8 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * } __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -546,8 +550,8 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -564,13 +568,13 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d short sy = (short)Y; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : (float)0; } } __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -622,8 +626,8 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 } __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -680,5 +684,5 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 857f891c38..0dd3e69c40 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0 __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); for(j = 0; j < tpl_cols; j ++) { - sum = mad24(img_ptr[j], tpl_ptr[j], sum); + sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); } } - res[res_idx] = sum; + res[res_idx] = (float)sum; } } @@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0 sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); } } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; + res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); } } @@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { - float sum = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); res[res_idx] -= sum * tpl_sum; } } diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index db11ed1410..64ae3ea980 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -53,76 +53,96 @@ //---------------------------------------------------------------------------- // Histogram computation - -__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, - const int cnbins, 
const int cblock_hist_size, const int img_block_width, - const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, - const float scale, __global float* block_hists, __local float* smem) +// 12 threads for a cell, 12x4 threads per block +__kernel void compute_hists_kernel( + const int cblock_stride_x, const int cblock_stride_y, + const int cnbins, const int cblock_hist_size, const int img_block_width, + const int blocks_in_group, const int blocks_total, + const int grad_quadstep, const int qangle_step, + __global const float* grad, __global const uchar* qangle, + const float scale, __global float* block_hists, __local float* smem) { - const int lidX = get_local_id(0); + const int lx = get_local_id(0); + const int lp = lx / 24; /* local group id */ + const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ + const int gidY = gid / img_block_width; + const int gidX = gid - gidY * img_block_width; + + const int lidX = lx - lp * 24; const int lidY = get_local_id(1); - const int gidX = get_group_id(0); - const int gidY = get_group_id(1); - const int cell_x = lidX / 16; + const int cell_x = lidX / 12; const int cell_y = lidY; - const int cell_thread_x = lidX & 0xF; + const int cell_thread_x = lidX - cell_x * 12; - __local float* hists = smem; - __local float* final_hist = smem + cnbins * 48; + __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * + CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); + __local float* final_hist = hists + cnbins * + (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; const int offset_y = gidY * cblock_stride_y + (cell_y << 2); - __global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1); - __global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1); + __global const float* grad_ptr = (gid < blocks_total) ? 
+ grad + offset_y * grad_quadstep + (offset_x << 1) : grad; + __global const uchar* qangle_ptr = (gid < blocks_total) ? + qangle + offset_y * qangle_step + (offset_x << 1) : qangle; - // 12 means that 12 pixels affect on block's cell (in one row) - if (cell_thread_x < 12) + __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + + cell_thread_x; + for (int bin_id = 0; bin_id < cnbins; ++bin_id) + hist[bin_id * 48] = 0.f; + + const int dist_x = -4 + cell_thread_x - 4 * cell_x; + const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + + const int dist_y_begin = -4 - 4 * lidY; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { - __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x; - for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48] = 0.f; + float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); + uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); - const int dist_x = -4 + cell_thread_x - 4 * cell_x; + grad_ptr += grad_quadstep; + qangle_ptr += qangle_step; - const int dist_y_begin = -4 - 4 * lidY; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) - { - float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - grad_ptr += grad_quadstep; - qangle_ptr += qangle_step; + float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * + dist_center_x) * scale); + float interp_weight = (8.f - fabs(dist_y + 0.5f)) * + (8.f - fabs(dist_x + 0.5f)) / 64.f; - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); - - float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale); - float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f; - - hist[bin.x * 48] += gaussian * interp_weight * vote.x; - hist[bin.y * 48] += gaussian * interp_weight 
* vote.y; - } - - volatile __local float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) - { - if (cell_thread_x < 6) hist_[0] += hist_[6]; - if (cell_thread_x < 3) hist_[0] += hist_[3]; - if (cell_thread_x == 0) - final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; - } + hist[bin.x * 48] += gaussian * interp_weight * vote.x; + hist[bin.y * 48] += gaussian * interp_weight * vote.y; } - barrier(CLK_LOCAL_MEM_FENCE); - __global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size; + volatile __local float* hist_ = hist; + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) + { + if (cell_thread_x < 6) + hist_[0] += hist_[6]; + barrier(CLK_LOCAL_MEM_FENCE); + if (cell_thread_x < 3) + hist_[0] += hist_[3]; +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (cell_thread_x == 0) + final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = + hist_[0] + hist_[1] + hist_[2]; + } +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; - if (tid < cblock_hist_size) + int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; + if ((tid < cblock_hist_size) && (gid < blocks_total)) + { + __global float* block_hist = block_hists + + (gidY * img_block_width + gidX) * cblock_hist_size; block_hist[tid] = final_hist[tid]; + } } //------------------------------------------------------------- @@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size) unsigned int tid = get_local_id(0); float sum = smem[tid]; - if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 512) + { + if (tid < 256) 
smem[tid] = sum = sum + smem[tid + 256]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 256) + { + if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 128) + { + if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + } if (tid < 32) { if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { +#endif if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; } @@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr if (tid < 64) products[tid] = product = product + products[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); + volatile __local float* smem = products; if (tid < 32) { - volatile __local float* smem = products; smem[tid] = product = product + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - volatile __local float* smem = products; +#endif smem[tid] = product = product + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif smem[tid] = product = product + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif smem[tid] = product = product + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif smem[tid] 
= product = product + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif smem[tid] = product = product + smem[tid + 1]; } @@ -248,8 +331,8 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr // Extract descriptors __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, - const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - __global const float* block_hists, __global float* descriptors) + const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -271,8 +354,8 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in } __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, - const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) + const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, + const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -301,8 +384,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in // Gradients computation __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + const __global uchar4 * img, __global float * grad, __global 
uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); @@ -400,8 +483,8 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c } __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + __global const uchar * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index c772be78ac..1043b8410b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2 } #define BUFFER 64 + +#ifdef CPU +void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + smem3[tid] = val3 += smem3[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += 
smem2[tid + 16]; + smem3[tid] = val3 += smem3[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = val1 += smem1[tid + 8]; + smem2[tid] = val2 += smem2[tid + 8]; + smem3[tid] = val3 += smem3[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = val1 += smem1[tid + 4]; + smem2[tid] = val2 += smem2[tid + 4]; + smem3[tid] = val3 += smem3[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = val1 += smem1[tid + 2]; + smem2[tid] = val2 += smem2[tid + 2]; + smem3[tid] = val3 += smem3[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = val1 += smem1[tid + 1]; + smem2[BUFFER] = val2 += smem2[tid + 1]; + smem3[BUFFER] = val3 += smem3[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + smem2[tid] = (val2 += smem2[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + smem2[tid] = (val2 += smem2[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + smem2[tid] = (val2 += smem2[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + smem2[tid] = (val2 += smem2[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + smem2[tid] = (val2 += smem2[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + smem2[tid] = (val2 += smem2[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + smem2[tid] = (val2 += smem2[tid + 2]); + } + 
barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + smem2[BUFFER] = (val2 += smem2[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce1(float val1, volatile __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#else void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; @@ -325,6 +528,7 @@ void reduce1(float val1, __local float* smem1, int tid) vmem1[tid] = val1 += vmem1[tid + 1]; } } +#endif #define SCALE (1.0f / (1 << 20)) #define THRESHOLD 0.01f @@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); } - +#define GRIDSIZE 3 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char 
calcErr) { +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else __local float smem1[BUFFER]; __local float smem2[BUFFER]; __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 prevPt = prevPts[gid] / (1 << level); + float2 prevPt = prevPts[gid] / (float2)(1 << level); if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) { @@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float A12 = 0; float A22 = 0; - float I_patch[3][3]; - float dIdx_patch[3][3]; - float dIdy_patch[3][3]; + float I_patch[GRIDSIZE][GRIDSIZE]; + float dIdx_patch[GRIDSIZE][GRIDSIZE]; + float dIdy_patch[GRIDSIZE][GRIDSIZE]; yBase=yid; { @@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], &A11, &A12, &A22); } + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -609,8 +826,13 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, nextPts[gid] = prevPt; if (calcErr) - err[gid] = smem1[0] / (c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y); +#endif } - } + __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, __global const float2* 
prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { - __local float smem1[BUFFER]; - __local float smem2[BUFFER]; - __local float smem3[BUFFER]; +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 nextPt = prevPts[gid]/(1<= cols || nextPt.y < 0 || nextPt.y >= rows) { @@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, // extract the patch from the first image, compute covariation matrix of derivatives - float A11 = 0; - float A12 = 0; - float A22 = 0; + float A11 = 0.0f; + float A12 = 0.0f; + float A22 = 0.0f; float4 I_patch[8]; float4 dIdx_patch[8]; @@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, &b1, &b2); } - reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, nextPts[gid] = nextPt; if (calcErr) - err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / 
(float)(3 * c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y); +#endif } } diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 99177c7bd0..196a786d5b 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -226,9 +226,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; - // int Y = get_group_id(1) * ROWSperTHREAD + radius; + // int Y = get_group_id(1) * ROWSperTHREAD + radius; - #define Y (get_group_id(1) * ROWSperTHREAD + radius) +#define Y (get_group_id(1) * ROWSperTHREAD + radius) volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; @@ -251,9 +251,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < cwidth - radius && Y < cheight - radius) { - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[0]) { disparImage[0] = (unsigned char)(d + minSSD.y); @@ -264,7 +264,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char for(int row = 1; row < end_row; row++) { int idx1 = y_tex * img_step + x_tex; - int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex; + int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE); @@ -278,10 +278,10 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < 
cwidth - radius && row < cheight - radius - Y) { int idx = row * cminSSD_step; - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[idx]) { disparImage[disp_step * row] = (unsigned char)(d + minSSD.y); @@ -378,50 +378,50 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in int beg_row = group_id_y * RpT; int end_row = min(beg_row + RpT, disp_rows); - // if (x < disp_cols) - // { - int y = beg_row; +// if (x < disp_cols) +// { + int y = beg_row; - float sum = 0; - float sum_extra = 0; + float sum = 0; + float sum_extra = 0; - for(int i = y - winsz2; i <= y + winsz2; ++i) - { - sum += sobel(input, x - winsz2, i, input_rows, input_cols); - if (cols_extra) - sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); - } - *cols = sum; + for(int i = y - winsz2; i <= y + winsz2; ++i) + { + sum += sobel(input, x - winsz2, i, input_rows, input_cols); if (cols_extra) + sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); + } + *cols = sum; + if (cols_extra) + *cols_extra = sum_extra; + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; + if (sum_win < threshold) + disp[y * disp_step + x] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + for(int y = beg_row + 1; y < end_row; ++y) + { + sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + + sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); + *cols = sum; + + if (cols_extra) + { + sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) + + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); *cols_extra = sum_extra; + } barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; if (sum_win < threshold) disp[y * disp_step + x] = 0; barrier(CLK_LOCAL_MEM_FENCE); - - for(int y = beg_row + 1; y < end_row; 
++y) - { - sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + - sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); - *cols = sum; - - if (cols_extra) - { - sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) - + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); - *cols_extra = sum_extra; - } - - barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; - if (sum_win < threshold) - disp[y * disp_step + x] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - } - // } + } + // } } diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index d3273f7fe0..820d577552 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -115,10 +115,9 @@ int main(int argc, char **argv) std::cout << "platform invalid\n"; return -1; } - if(pid != 0 || device != 0) - { - setDevice(oclinfo[pid], device); - } + + setDevice(oclinfo[pid], device); + cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; return RUN_ALL_TESTS(); } diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index f643864a86..e46fdbddd1 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -1531,6 +1531,10 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); +INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), + Values(false))); + INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); // Values(false) is the reserved parameter @@ -1586,19 +1590,19 @@ INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32F 
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine( - Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false))); diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp index 424781fe0a..71b299eb65 100644 --- a/modules/ocl/test/test_brute_force_matcher.cpp +++ b/modules/ocl/test/test_brute_force_matcher.cpp @@ -43,16 +43,14 @@ #ifdef HAVE_OPENCL namespace { - ///////////////////////////////////////////////////////////////////////////////////////////////// // BruteForceMatcher - - CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist, cv::ocl::BruteForceMatcher_OCL_base::L2Dist, cv::ocl::BruteForceMatcher_OCL_base::HammingDist) + CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist,\ + cv::ocl::BruteForceMatcher_OCL_base::L2Dist,\ + cv::ocl::BruteForceMatcher_OCL_base::HammingDist) 
IMPLEMENT_PARAM_CLASS(DescriptorSize, int) - - PARAM_TEST_CASE(BruteForceMatcher/*, NormCode*/, DistType, DescriptorSize) + PARAM_TEST_CASE(BruteForceMatcher, DistType, DescriptorSize) { - //std::vector oclinfo; cv::ocl::BruteForceMatcher_OCL_base::DistType distType; int normCode; int dim; @@ -64,13 +62,9 @@ namespace virtual void SetUp() { - //normCode = GET_PARAM(0); distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0); dim = GET_PARAM(1); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - queryDescCount = 300; // must be even number because we split train data in some cases in two countFactor = 4; // do not change it @@ -172,49 +166,33 @@ namespace cv::ocl::BruteForceMatcher_OCL_base matcher(distType); - // assume support atomic. - //if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS)) - //{ - // try - // { - // std::vector< std::vector > matches; - // matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius); - // } - // catch (const cv::Exception& e) - // { - // ASSERT_EQ(CV_StsNotImplemented, e.code); - // } - //} - //else + std::vector< std::vector > matches; + matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius); + + ASSERT_EQ(static_cast(queryDescCount), matches.size()); + + int badCount = 0; + for (size_t i = 0; i < matches.size(); i++) { - std::vector< std::vector > matches; - matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius); - - ASSERT_EQ(static_cast(queryDescCount), matches.size()); - - int badCount = 0; - for (size_t i = 0; i < matches.size(); i++) + if ((int)matches[i].size() != 1) { - if ((int)matches[i].size() != 1) - { - badCount++; - } - else - { - cv::DMatch match = matches[i][0]; - if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) - badCount++; - } + badCount++; + } + else + { + cv::DMatch match = matches[i][0]; + if ((match.queryIdx != (int)i) || 
(match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) + badCount++; } - - ASSERT_EQ(0, badCount); } + + ASSERT_EQ(0, badCount); } - INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine( - //ALL_DEVICES, - testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)), - testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)))); + INSTANTIATE_TEST_CASE_P(OCL_Features2D, BruteForceMatcher, + testing::Combine( + testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)), + testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)))); } // namespace #endif diff --git a/modules/python/src2/cv.py b/modules/python/src2/cv.py old mode 100755 new mode 100644 index 4238d05f7a..2d4daf08b0 --- a/modules/python/src2/cv.py +++ b/modules/python/src2/cv.py @@ -1,3 +1 @@ -#/usr/bin/env python - from cv2.cv import * diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 13c30f5958..7d5fbaee96 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -396,7 +396,7 @@ static PyObject* pyopencv_from(const Mat& m) if(!p->refcount || p->allocator != &g_numpyAllocator) { temp.allocator = &g_numpyAllocator; - m.copyTo(temp); + ERRWRAP2(m.copyTo(temp)); p = &temp; } p->addref(); diff --git a/modules/python/src2/gen.py b/modules/python/src2/gen.py index 65cafc9900..40879e569f 100755 --- a/modules/python/src2/gen.py +++ b/modules/python/src2/gen.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys from string import Template diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 5f4f60fce6..d1b3358f67 100755 --- a/modules/python/src2/gen2.py +++ 
b/modules/python/src2/gen2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import hdr_parser, sys, re, os, cStringIO from string import Template diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index cf26b59ebe..b13fe8cf8d 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string diff --git a/modules/python/test/calchist.py b/modules/python/test/calchist.py index 0a52258b20..287e22f91e 100755 --- a/modules/python/test/calchist.py +++ b/modules/python/test/calchist.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # Calculating and displaying 2D Hue-Saturation histogram of a color image import sys diff --git a/modules/python/test/camera_calibration.py b/modules/python/test/camera_calibration.py index 488dd15c62..8ffc5b1cd9 100755 --- a/modules/python/test/camera_calibration.py +++ b/modules/python/test/camera_calibration.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import math diff --git a/modules/python/test/findstereocorrespondence.py b/modules/python/test/findstereocorrespondence.py index 8f11738cce..40a9603beb 100755 --- a/modules/python/test/findstereocorrespondence.py +++ b/modules/python/test/findstereocorrespondence.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import cv2.cv as cv diff --git a/modules/python/test/goodfeatures.py b/modules/python/test/goodfeatures.py index 62907772ab..5ccd5b46c1 100755 --- a/modules/python/test/goodfeatures.py +++ b/modules/python/test/goodfeatures.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import unittest diff --git a/modules/python/test/leak1.py b/modules/python/test/leak1.py index dde5608951..dbd6040a5a 100755 --- a/modules/python/test/leak1.py +++ b/modules/python/test/leak1.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import 
cv2.cv as cv import numpy as np diff --git a/modules/python/test/leak2.py b/modules/python/test/leak2.py index af1cb0556c..518226448a 100755 --- a/modules/python/test/leak2.py +++ b/modules/python/test/leak2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import numpy as np diff --git a/modules/python/test/leak3.py b/modules/python/test/leak3.py index f72afbbf08..d763c4044d 100755 --- a/modules/python/test/leak3.py +++ b/modules/python/test/leak3.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import math diff --git a/modules/python/test/leak4.py b/modules/python/test/leak4.py index dcfc5cfdc9..9e5864092b 100755 --- a/modules/python/test/leak4.py +++ b/modules/python/test/leak4.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import math diff --git a/modules/python/test/precornerdetect.py b/modules/python/test/precornerdetect.py index 29a6ca1ecd..97aa906d4a 100755 --- a/modules/python/test/precornerdetect.py +++ b/modules/python/test/precornerdetect.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv diff --git a/modules/python/test/test.py b/modules/python/test/test.py index 33e605ec06..0886834f35 100755 --- a/modules/python/test/test.py +++ b/modules/python/test/test.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/test2.py b/modules/python/test/test2.py index 8796e70198..fc345f4b5b 100644 --- a/modules/python/test/test2.py +++ b/modules/python/test/test2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/ticket_6.py b/modules/python/test/ticket_6.py index 533027f5b9..7249ff2c75 100755 --- a/modules/python/test/ticket_6.py +++ b/modules/python/test/ticket_6.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import urllib import cv2.cv as cv diff --git 
a/modules/python/test/tickets.py b/modules/python/test/tickets.py index 1e756bcd82..de51e7aa16 100755 --- a/modules/python/test/tickets.py +++ b/modules/python/test/tickets.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/transformations.py b/modules/python/test/transformations.py index 1f63bcef22..5dce6b0497 100755 --- a/modules/python/test/transformations.py +++ b/modules/python/test/transformations.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- # transformations.py diff --git a/modules/ts/misc/testlog_parser.py b/modules/ts/misc/testlog_parser.py index f61b47bba9..7ae6aa5980 100755 --- a/modules/ts/misc/testlog_parser.py +++ b/modules/ts/misc/testlog_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, re, os.path from xml.dom.minidom import parse diff --git a/samples/android/15-puzzle/res/layout/activity_puzzle15.xml b/samples/android/15-puzzle/res/layout/activity_puzzle15.xml deleted file mode 100644 index 3257ed801c..0000000000 --- a/samples/android/15-puzzle/res/layout/activity_puzzle15.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - diff --git a/samples/android/15-puzzle/res/menu/activity_puzzle15.xml b/samples/android/15-puzzle/res/menu/activity_puzzle15.xml deleted file mode 100644 index 7810d81963..0000000000 --- a/samples/android/15-puzzle/res/menu/activity_puzzle15.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - diff --git a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java index 466400d873..ebd34fc7e2 100644 --- a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java +++ b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java @@ -6,6 +6,7 @@ import org.opencv.android.OpenCVLoader; import org.opencv.core.Mat; import 
org.opencv.android.CameraBridgeViewBase; import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener; +import org.opencv.android.JavaCameraView; import android.os.Bundle; import android.app.Activity; @@ -22,6 +23,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, private CameraBridgeViewBase mOpenCvCameraView; private Puzzle15Processor mPuzzle15; + private MenuItem mItemHideNumbers; + private MenuItem mItemStartNewGame; + private int mGameWidth; private int mGameHeight; @@ -52,9 +56,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, super.onCreate(savedInstanceState); getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON); - setContentView(R.layout.activity_puzzle15); - - mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.puzzle_activity_surface_view); + Log.d(TAG, "Creating and seting view"); + mOpenCvCameraView = (CameraBridgeViewBase) new JavaCameraView(this, -1); + setContentView(mOpenCvCameraView); mOpenCvCameraView.setCvCameraViewListener(this); mPuzzle15 = new Puzzle15Processor(); mPuzzle15.prepareNewGame(); @@ -83,17 +87,19 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, @Override public boolean onCreateOptionsMenu(Menu menu) { - getMenuInflater().inflate(R.menu.activity_puzzle15, menu); + Log.i(TAG, "called onCreateOptionsMenu"); + mItemHideNumbers = menu.add("Show/hide tile numbers"); + mItemStartNewGame = menu.add("Start new game"); return true; } @Override public boolean onOptionsItemSelected(MenuItem item) { Log.i(TAG, "Menu Item selected " + item); - if (item.getItemId() == R.id.menu_start_new_game) { + if (item == mItemStartNewGame) { /* We need to start new game */ mPuzzle15.prepareNewGame(); - } else if (item.getItemId() == R.id.menu_toggle_tile_numbers) { + } else if (item == mItemHideNumbers) { /* We need to enable or disable drawing of the tile numbers */ mPuzzle15.toggleTileNumbers(); } diff --git 
a/samples/android/native-activity/jni/Android.mk b/samples/android/native-activity/jni/Android.mk index 720d38b40d..fd4fd2bfbb 100644 --- a/samples/android/native-activity/jni/Android.mk +++ b/samples/android/native-activity/jni/Android.mk @@ -6,7 +6,7 @@ include ../../sdk/native/jni/OpenCV.mk LOCAL_MODULE := native_activity LOCAL_SRC_FILES := native.cpp -LOCAL_LDLIBS := -lm -llog -landroid +LOCAL_LDLIBS += -lm -llog -landroid LOCAL_STATIC_LIBRARIES := android_native_app_glue include $(BUILD_SHARED_LIBRARY) diff --git a/samples/android/native-activity/jni/Application.mk b/samples/android/native-activity/jni/Application.mk index a89e12df19..e9392cfed2 100644 --- a/samples/android/native-activity/jni/Application.mk +++ b/samples/android/native-activity/jni/Application.mk @@ -1,2 +1,4 @@ APP_ABI := armeabi-v7a +APP_STL := gnustl_static +APP_CPPFLAGS := -frtti -fexceptions APP_PLATFORM := android-9 diff --git a/samples/cpp/freak_demo.cpp b/samples/cpp/freak_demo.cpp index b420e455c5..82a435fcc4 100644 --- a/samples/cpp/freak_demo.cpp +++ b/samples/cpp/freak_demo.cpp @@ -73,7 +73,7 @@ int main( int argc, char** argv ) { } Mat imgB = imread(argv[2], IMREAD_GRAYSCALE ); - if( !imgA.data ) { + if( !imgB.data ) { std::cout << " --(!) 
Error reading image " << argv[2] << std::endl; return -1; } diff --git a/samples/cpp/lkdemo.cpp b/samples/cpp/lkdemo.cpp index 3913e2c2db..c665cfdfe2 100644 --- a/samples/cpp/lkdemo.cpp +++ b/samples/cpp/lkdemo.cpp @@ -12,9 +12,8 @@ static void help() { // print a welcome message, and the OpenCV version cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n" - "Using OpenCV version %s\n" << CV_VERSION << "\n" - << endl; - + "Using OpenCV version " << CV_VERSION << endl; + cout << "\nIt uses camera by default, but you can provide a path to video as an argument.\n"; cout << "\nHot keys: \n" "\tESC - quit the program\n" "\tr - auto-initialize tracking\n" @@ -30,13 +29,15 @@ static void onMouse( int event, int x, int y, int /*flags*/, void* /*param*/ ) { if( event == EVENT_LBUTTONDOWN ) { - point = Point2f((float)x,(float)y); + point = Point2f((float)x, (float)y); addRemovePt = true; } } int main( int argc, char** argv ) { + help(); + VideoCapture cap; TermCriteria termcrit(TermCriteria::COUNT|TermCriteria::EPS,20,0.03); Size subPixWinSize(10,10), winSize(31,31); @@ -56,8 +57,6 @@ int main( int argc, char** argv ) return 0; } - help(); - namedWindow( "LK Demo", 1 ); setMouseCallback( "LK Demo", onMouse, 0 ); @@ -134,17 +133,16 @@ int main( int argc, char** argv ) needToInit = true; break; case 'c': + points[0].clear(); points[1].clear(); break; case 'n': nightMode = !nightMode; break; - default: - ; } std::swap(points[1], points[0]); - swap(prevGray, gray); + cv::swap(prevGray, gray); } return 0; diff --git a/samples/python2/common.py b/samples/python2/common.py index 5996695943..3988fe2a9e 100755 --- a/samples/python2/common.py +++ b/samples/python2/common.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + ''' This module contains some common routines used by other samples. 
''' diff --git a/samples/python2/dft.py b/samples/python2/dft.py index 9aac53a884..0a5ca650c1 100644 --- a/samples/python2/dft.py +++ b/samples/python2/dft.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2 import numpy as np diff --git a/samples/python2/watershed.py b/samples/python2/watershed.py index 037af1ec45..c6247da22c 100755 --- a/samples/python2/watershed.py +++ b/samples/python2/watershed.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Watershed segmentation