From e1f43e3b0362320790b85c856cd86c5de511067d Mon Sep 17 00:00:00 2001 From: peng xiao Date: Thu, 18 Jul 2013 17:25:00 +0800 Subject: [PATCH] Add sort_by_key for oclMat. Most codes are ported from AMD's Bolt library. Four methods are implemented: SORT_BITONIC, // only support power-of-2 buffer size SORT_SELECTION, // cannot sort duplicate keys SORT_MERGE, SORT_RADIX // only support signed int/float keys --- modules/ocl/include/opencv2/ocl/ocl.hpp | 27 ++ .../src/opencl/kernel_radix_sort_by_key.cl | 176 +++++++ modules/ocl/src/opencl/kernel_sort_by_key.cl | 245 ++++++++++ .../src/opencl/kernel_stablesort_by_key.cl | 296 ++++++++++++ modules/ocl/src/sort_by_key.cpp | 453 ++++++++++++++++++ modules/ocl/test/test_sort.cpp | 244 ++++++++++ 6 files changed, 1441 insertions(+) create mode 100644 modules/ocl/src/opencl/kernel_radix_sort_by_key.cl create mode 100644 modules/ocl/src/opencl/kernel_sort_by_key.cl create mode 100644 modules/ocl/src/opencl/kernel_stablesort_by_key.cl create mode 100644 modules/ocl/src/sort_by_key.cpp create mode 100644 modules/ocl/test/test_sort.cpp diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index e7b133e672..1674c86280 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1673,6 +1673,33 @@ namespace cv oclMat diff_buf; oclMat norm_buf; }; + // current supported sorting methods + enum + { + SORT_BITONIC, // only support power-of-2 buffer size + SORT_SELECTION, // cannot sort duplicate keys + SORT_MERGE, + SORT_RADIX // only support signed int/float keys + }; + //! Returns the sorted result of all the elements in input based on equivalent keys. + // + // The element unit in the values to be sorted is determined from the data type, + // i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its + // matrix dimension. + // both keys and values will be sorted inplace + // Key needs to be single channel oclMat. + // TODO(pengx): add supported types for values + // + // Example: + // input - + // keys = {2, 3, 1} (CV_8UC1) + // values = {10,5, 4,3, 6,2} (CV_8UC2) + // sort_by_key(keys, values, SORT_SELECTION, false); + // output - + // keys = {1, 2, 3} (CV_8UC1) + // values = {6,2, 10,5, 4,3} (CV_8UC2) + void CV_EXPORTS sort_by_key(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false); + void CV_EXPORTS sort_by_key(oclMat& keys, oclMat& values, size_t vecSize, int method, bool isGreaterThan = false); } } #if defined _MSC_VER && _MSC_VER >= 1200 diff --git a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl new file mode 100644 index 0000000000..fdb440aeea --- /dev/null +++ b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl @@ -0,0 +1,176 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + +#ifndef N // number of radices +#define N 4 +#endif + +#ifndef K_T +#define K_T float +#endif + +#ifndef V_T +#define V_T float +#endif + +#ifndef IS_GT +#define IS_GT 0 +#endif + + +// from Thrust::b40c, link: +// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h +__inline uint convertKey(uint converted_key) +{ +#ifdef K_FLT + unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000; + converted_key ^= mask; +#elif defined(K_INT) + const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); + converted_key ^= SIGN_MASK; +#else + +#endif + return converted_key; +} + +//FIXME(pengx17): +// exclusive scan, need to be optimized as this is too naive... +kernel + void naiveScanAddition( + __global int * input, + __global int * output, + int size + ) +{ + if(get_global_id(0) == 0) + { + output[0] = 0; + for(int i = 1; i < size; i ++) + { + output[i] = output[i - 1] + input[i - 1]; + } + } +} + +// following is ported from +// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl +kernel + void histogramRadixN ( + __global K_T* unsortedKeys, + __global int * buckets, + uint shiftCount + ) +{ + const int RADIX_T = N; + const int RADICES_T = (1 << RADIX_T); + const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T; + const int MASK_T = (1 << RADIX_T) - 1; + int localBuckets[16] = {0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0}; + int globalId = get_global_id(0); + int numOfGroups = get_num_groups(0); + + /* Calculate thread-histograms */ + for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) + { + uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)])); + value = (value >> shiftCount) & MASK_T; +#if IS_GT + localBuckets[RADICES_T - value - 1]++; +#else + localBuckets[value]++; +#endif + } + + for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i) + { + buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i]; + } +} + +kernel + void permuteRadixN ( + __global K_T* unsortedKeys, + __global V_T* unsortedVals, + __global int* scanedBuckets, + uint shiftCount, + __global K_T* sortedKeys, + __global V_T* sortedVals + ) +{ + const int RADIX_T = N; + const int RADICES_T = (1 << RADIX_T); + const int MASK_T = (1<> shiftCount) & MASK_T; + uint index = localIndex[maskedValue]; + sortedKeys[index] = ovalue; + sortedVals[index] = unsortedVals[old_idx]; + localIndex[maskedValue] = index + 1; + } +} diff --git a/modules/ocl/src/opencl/kernel_sort_by_key.cl b/modules/ocl/src/opencl/kernel_sort_by_key.cl new file mode 100644 index 0000000000..18e9d419aa --- /dev/null +++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl @@ -0,0 +1,245 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef K_T +#define K_T float +#endif + +#ifndef V_T +#define V_T float +#endif + +#ifndef IS_GT +#define IS_GT false +#endif + +#if IS_GT +#define my_comp(x,y) ((x) > (y)) +#else +#define my_comp(x,y) ((x) < (y)) +#endif + +/////////////////////// Bitonic sort //////////////////////////// +// ported from +// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl +__kernel + void bitonicSort + ( + __global K_T * keys, + __global V_T * vals, + int count, + int stage, + int passOfStage + ) +{ + const int threadId = get_global_id(0); + if(threadId >= count / 2) + { + return; + } + const int pairDistance = 1 << (stage - passOfStage); + const int blockWidth = 2 * pairDistance; + + int leftId = min( (threadId % pairDistance) + + (threadId / pairDistance) * blockWidth, count ); + + int rightId = min( leftId + pairDistance, count ); + + int temp; + + const V_T lval = vals[leftId]; + const V_T rval = vals[rightId]; + + const K_T lkey = keys[leftId]; + const K_T rkey = keys[rightId]; + + int sameDirectionBlockWidth = 1 << stage; + + if((threadId/sameDirectionBlockWidth) % 2 == 1) + { + temp = rightId; + rightId = leftId; + leftId = temp; + } + + const bool compareResult = my_comp(lkey, rkey); + + if(compareResult) + { + keys[rightId] = rkey; + keys[leftId] = lkey; + vals[rightId] = rval; + vals[leftId] = lval; + } + else + { + keys[rightId] = lkey; + keys[leftId] = rkey; + vals[rightId] = lval; + vals[leftId] = rval; + } +} + +/////////////////////// Selection sort //////////////////////////// +//kernel is ported from Bolt library: +//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl +__kernel + void selectionSortLocal + ( + __global K_T * keys, + __global V_T * vals, + const int count, + __local K_T * scratch + ) +{ + int i = get_local_id(0); // index in workgroup + int numOfGroups = get_num_groups(0); // index in workgroup + int groupID = get_group_id(0); + int wg = get_local_size(0); // workgroup size = block size + int n; // number of elements to be processed for this work group + + int offset = groupID * wg; + int same = 0; + + vals += offset; + keys += offset; + n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; + + int clamped_i= min(i, n - 1); + + K_T key1 = keys[clamped_i], key2; + V_T val1 = vals[clamped_i]; + scratch[i] = key1; + barrier(CLK_LOCAL_MEM_FENCE); + + if(i >= n) + { + return; + } + + int pos = 0; + for (int j=0;j= count) + return; + V_T val1 = vals[offset + i]; + + K_T key1 = keys[offset + i]; + K_T key2; + + for(int j=0; j (y)) +#else +#define my_comp(x,y) ((x) < (y)) +#endif + +///////////// parallel merge sort /////////////// +// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl +uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal) +{ + // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence + uint firstIndex = left; + uint lastIndex = right; + + // This loops through [firstIndex, lastIndex) + // Since firstIndex and lastIndex will be different for every thread depending on the nested branch, + // this while loop will be divergent within a wavefront + while( firstIndex < lastIndex ) + { + K_T dataVal = data[ firstIndex ]; + + // This branch will create divergent wavefronts + if( my_comp( dataVal, searchVal ) ) + { + firstIndex = firstIndex+1; + } + else + { + break; + } + } + + return firstIndex; +} + +// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted +// by a base pointer and left and right index for a particular candidate value. The comparison operator is +// passed as a functor parameter my_comp +// This function returns an index that is the first index whos value would be equal to the searched value +uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +{ + // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence + uint firstIndex = left; + uint lastIndex = right; + + // This loops through [firstIndex, lastIndex) + // Since firstIndex and lastIndex will be different for every thread depending on the nested branch, + // this while loop will be divergent within a wavefront + while( firstIndex < lastIndex ) + { + // midIndex is the average of first and last, rounded down + uint midIndex = ( firstIndex + lastIndex ) / 2; + K_T midValue = data[ midIndex ]; + + // This branch will create divergent wavefronts + if( my_comp( midValue, searchVal ) ) + { + firstIndex = midIndex+1; + // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex ); + } + else + { + lastIndex = midIndex; + // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex ); + } + } + + return firstIndex; +} + +// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted +// by a base pointer and left and right index for a particular candidate value. The comparison operator is +// passed as a functor parameter my_comp +// This function returns an index that is the first index whos value would be greater than the searched value +// If the search value is not found in the sequence, upperbound returns the same result as lowerbound +uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal) +{ + uint upperBound = lowerBoundBinary( data, left, right, searchVal ); + + // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound ); + // If upperBound == right, then searchVal was not found in the sequence. Just return. + if( upperBound != right ) + { + // While the values are equal i.e. !(x < y) && !(y < x) increment the index + K_T upperValue = data[ upperBound ]; + while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) ) + { + upperBound++; + upperValue = data[ upperBound ]; + } + } + + return upperBound; +} + +// This kernel implements merging of blocks of sorted data. The input to this kernel most likely is +// the output of blockInsertionSortTemplate. It is expected that the source array contains multiple +// blocks, each block is independently sorted. The goal is to write into the output buffer half as +// many blocks, of double the size. The even and odd blocks are stably merged together to form +// a new sorted block of twice the size. The algorithm is out-of-place. +kernel void merge( + global K_T* iKey_ptr, + global V_T* iValue_ptr, + global K_T* oKey_ptr, + global V_T* oValue_ptr, + const uint srcVecSize, + const uint srcLogicalBlockSize, + local K_T* key_lds, + local V_T* val_lds +) +{ + size_t globalID = get_global_id( 0 ); + size_t groupID = get_group_id( 0 ); + size_t localID = get_local_id( 0 ); + size_t wgSize = get_local_size( 0 ); + + // Abort threads that are passed the end of the input vector + if( globalID >= srcVecSize ) + return; // on SI this doesn't mess-up barriers + + // For an element in sequence A, find the lowerbound index for it in sequence B + uint srcBlockNum = globalID / srcLogicalBlockSize; + uint srcBlockIndex = globalID % srcLogicalBlockSize; + + // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex ); + + // Pairs of even-odd blocks will be merged together + // An even block should search for an insertion point in the next odd block, + // and the odd block should look for an insertion point in the corresponding previous even block + uint dstLogicalBlockSize = srcLogicalBlockSize<<1; + uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 ); + leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize; + leftBlockIndex = min( leftBlockIndex, srcVecSize ); + uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize ); + + // if( localID == 0 ) + // { + // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex ); + // } + + // For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex + // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex; + uint insertionIndex = 0; + if( (srcBlockNum & 0x1) == 0 ) + { + insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; + } + else + { + insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex; + } + + // The index of an element in the result sequence is the summation of it's indixes in the two input + // sequences + uint dstBlockIndex = srcBlockIndex + insertionIndex; + uint dstBlockNum = srcBlockNum/2; + + // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 ) + // { + // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex ); + // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] ); + // } + oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ]; + oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ]; + // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex ); +} + +kernel void blockInsertionSort( + global K_T* key_ptr, + global V_T* value_ptr, + const uint vecSize, + local K_T* key_lds, + local V_T* val_lds +) +{ + size_t gloId = get_global_id( 0 ); + size_t groId = get_group_id( 0 ); + size_t locId = get_local_id( 0 ); + size_t wgSize = get_local_size( 0 ); + + bool in_range = gloId < vecSize; + K_T key; + V_T val; + // Abort threads that are passed the end of the input vector + if (in_range) + { + // Make a copy of the entire input array into fast local memory + key = key_ptr[ gloId ]; + val = value_ptr[ gloId ]; + key_lds[ locId ] = key; + val_lds[ locId ] = val; + } + barrier( CLK_LOCAL_MEM_FENCE ); + // Sorts a workgroup using a naive insertion sort + // The sort uses one thread within a workgroup to sort the entire workgroup + if( locId == 0 && in_range ) + { + // The last workgroup may have an irregular size, so we calculate a per-block endIndex + // endIndex is essentially emulating a mod operator with subtraction and multiply + size_t endIndex = vecSize - ( groId * wgSize ); + endIndex = min( endIndex, wgSize ); + + // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex ); + + // Indices are signed because the while loop will generate a -1 index inside of the max function + for( int currIndex = 1; currIndex < endIndex; ++currIndex ) + { + key = key_lds[ currIndex ]; + val = val_lds[ currIndex ]; + int scanIndex = currIndex; + K_T ldsKey = key_lds[scanIndex - 1]; + while( scanIndex > 0 && my_comp( key, ldsKey ) ) + { + V_T ldsVal = val_lds[scanIndex - 1]; + + // If the keys are being swapped, make sure the values are swapped identicaly + key_lds[ scanIndex ] = ldsKey; + val_lds[ scanIndex ] = ldsVal; + + scanIndex = scanIndex - 1; + ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1 + } + key_lds[ scanIndex ] = key; + val_lds[ scanIndex ] = val; + } + } + barrier( CLK_LOCAL_MEM_FENCE ); + + if(in_range) + { + key = key_lds[ locId ]; + key_ptr[ gloId ] = key; + + val = val_lds[ locId ]; + value_ptr[ gloId ] = val; + } +} + +///////////// Radix sort from b40c library ///////////// diff --git a/modules/ocl/src/sort_by_key.cpp b/modules/ocl/src/sort_by_key.cpp new file mode 100644 index 0000000000..20f76f2835 --- /dev/null +++ b/modules/ocl/src/sort_by_key.cpp @@ -0,0 +1,453 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include +#include "precomp.hpp" + +namespace cv +{ +namespace ocl +{ + +extern const char * kernel_sort_by_key; +extern const char * kernel_stablesort_by_key; +extern const char * kernel_radix_sort_by_key; + +//TODO(pengx17): change this value depending on device other than a constant +const static unsigned int GROUP_SIZE = 256; + +const char * depth_strings[] = +{ + "uchar", //CV_8U + "char", //CV_8S + "ushort", //CV_16U + "short", //CV_16S + "int", //CV_32S + "float", //CV_32F + "double" //CV_64F +}; + +void genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf) +{ + sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s", + isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]); + if(vals.oclchannels() > 1) + { + sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels(), 10); + } +} +inline bool isSizePowerOf2(size_t size) +{ + return ((size - 1) & (size)) == 0; +} + +namespace bitonic_sort +{ +static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) +{ + CV_Assert(isSizePowerOf2(vecSize)); + + Context * cxt = Context::getContext(); + size_t globalThreads[3] = {vecSize / 2, 1, 1}; + size_t localThreads[3] = {GROUP_SIZE, 1, 1}; + + // 2^numStages should be equal to vecSize or the output is invalid + int numStages = 0; + for(int i = vecSize; i > 1; i >>= 1) + { + ++numStages; + } + char build_opt_buf [100]; + genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); + const int argc = 5; + std::vector< std::pair > args(argc); + String kernelname = "bitonicSort"; + + args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); + args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); + args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize); + + for(int stage = 0; stage < numStages; ++stage) + { + args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); + for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) + { + args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); + openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); + } + } +} +} /* bitonic_sort */ + +namespace selection_sort +{ +// FIXME: +// This function cannot sort arrays with duplicated keys +static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) +{ + CV_Error(-1, "This function is incorrect at the moment."); + Context * cxt = Context::getContext(); + + size_t globalThreads[3] = {vecSize, 1, 1}; + size_t localThreads[3] = {GROUP_SIZE, 1, 1}; + + std::vector< std::pair > args; + char build_opt_buf [100]; + genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); + + //local + String kernelname = "selectionSortLocal"; + int lds_size = GROUP_SIZE * keys.elemSize(); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize)); + args.push_back(std::make_pair(lds_size, (void*)NULL)); + + openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); + + //final + kernelname = "selectionSortFinal"; + args.pop_back(); + openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf); +} + +} /* selection_sort */ + + +namespace radix_sort +{ +//FIXME(pengx17): +// exclusive scan, need to be optimized as this is too naive... +//void naive_scan_addition(oclMat& input, oclMat& output) +//{ +// Context * cxt = Context::getContext(); +// size_t vecSize = input.cols; +// size_t globalThreads[3] = {1, 1, 1}; +// size_t localThreads[3] = {1, 1, 1}; +// +// String kernelname = "naiveScanAddition"; +// +// std::vector< std::pair > args; +// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data)); +// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data)); +// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize)); +// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1); +//} + +void naive_scan_addition_cpu(oclMat& input, oclMat& output) +{ + Mat m_input = input, m_output(output.size(), output.type()); + MatIterator_ i_mit = m_input.begin(); + MatIterator_ o_mit = m_output.begin(); + *o_mit = 0; + ++i_mit; + ++o_mit; + for(; i_mit != m_input.end(); ++i_mit, ++o_mit) + { + *o_mit = *(o_mit - 1) + *(i_mit - 1); + } + output = m_output; +} + + +//radix sort ported from Bolt +static void sort_by_key(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan) +{ + CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes + + bool isKeyFloat = keys.type() == CV_32F; + + const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a + //local array of 16 elements in the histogram kernel. + const int RADICES = (1 << RADIX); //Values handeled by each work-item? + + bool newBuffer = false; + size_t vecSize = origVecSize; + + unsigned int groupSize = RADICES; + + size_t mulFactor = groupSize * RADICES; + + oclMat buffer_keys, buffer_vals; + + if(origVecSize % mulFactor != 0) + { + vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor; + buffer_keys.create(1, vecSize, keys.type()); + buffer_vals.create(1, vecSize, vals.type()); + Scalar padding_value; + oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1)); + + if(isGreaterThan) + { + switch(buffer_keys.depth()) + { + case CV_32F: + padding_value = Scalar::all(-FLT_MAX); + break; + case CV_32S: + padding_value = Scalar::all(INT_MIN); + break; + } + } + else + { + switch(buffer_keys.depth()) + { + case CV_32F: + padding_value = Scalar::all(FLT_MAX); + break; + case CV_32S: + padding_value = Scalar::all(INT_MAX); + break; + } + } + ocl::copyMakeBorder( + keys(Rect(0,0,origVecSize,1)), buffer_keys, + 0, 0, 0, vecSize - origVecSize, + BORDER_CONSTANT, padding_value); + vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals); + newBuffer = true; + } + else + { + buffer_keys = keys; + buffer_vals = vals; + newBuffer = false; + } + oclMat swap_input_keys(1, vecSize, keys.type()); + oclMat swap_input_vals(1, vecSize, vals.type()); + oclMat hist_bin_keys(1, vecSize, CV_32SC1); + oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1); + + Context * cxt = Context::getContext(); + + size_t globalThreads[3] = {vecSize / RADICES, 1, 1}; + size_t localThreads[3] = {groupSize, 1, 1}; + + std::vector< std::pair > args; + char build_opt_buf [100]; + genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); + + //additional build option for radix sort + sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT"); + + String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")}; + + int swap = 0; + for(int bits = 0; bits < (keys.elemSize() * 8); bits += RADIX) + { + args.clear(); + //Do a histogram pass locally + if(swap == 0) + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); + } + else + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); + } + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); + openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads, + args, -1, -1, build_opt_buf); + + args.clear(); + //Perform a global scan + naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys); + // end of scan + if(swap == 0) + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); + } + else + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); + } + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits)); + + if(swap == 0) + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data)); + } + else + { + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data)); + } + openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads, + args, -1, -1, build_opt_buf); + swap = swap ? 0 : 1; + } + if(newBuffer) + { + buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys); + buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals); + } +} + +} /* radix_sort */ + +namespace merge_sort +{ +static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan) +{ + Context * cxt = Context::getContext(); + + size_t globalThreads[3] = {vecSize, 1, 1}; + size_t localThreads[3] = {GROUP_SIZE, 1, 1}; + + std::vector< std::pair > args; + char build_opt_buf [100]; + genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf); + + String kernelname[] = {String("blockInsertionSort"), String("merge")}; + int keylds_size = GROUP_SIZE * keys.elemSize(); + int vallds_size = GROUP_SIZE * vals.elemSize(); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data)); + args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize)); + args.push_back(std::make_pair(keylds_size, (void*)NULL)); + args.push_back(std::make_pair(vallds_size, (void*)NULL)); + + openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf); + + // Early exit for the case of no merge passes, values are already in destination vector + if(vecSize <= GROUP_SIZE) + { + return; + } + + // An odd number of elements requires an extra merge pass to sort + size_t numMerges = 0; + // Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64 + // this is how many merge passes we want + size_t log2BlockSize = vecSize >> 6; + for( ; log2BlockSize > 1; log2BlockSize >>= 1 ) + { + ++numMerges; + } + // Check to see if the input vector size is a power of 2, if not we will need last merge pass + numMerges += isSizePowerOf2(vecSize)? 1: 0; + + // Allocate a flipflop buffer because the merge passes are out of place + oclMat tmpKeyBuffer(keys.size(), keys.type()); + oclMat tmpValBuffer(vals.size(), vals.type()); + args.resize(8); + + args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize); + args[6] = std::make_pair(keylds_size, (void*)NULL); + args[7] = std::make_pair(vallds_size, (void*)NULL); + + for(size_t pass = 1; pass <= numMerges; ++pass ) + { + // For each pass, flip the input-output buffers + if( pass & 0x1 ) + { + args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); + args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); + args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); + args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); + } + else + { + args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data); + args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data); + args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data); + args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data); + } + // For each pass, the merge window doubles + unsigned int srcLogicalBlockSize = static_cast( localThreads[0] << (pass-1) ); + args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize); + openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf); + } + // If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy + // the results back into the input array + if( numMerges & 1 ) + { + tmpKeyBuffer.copyTo(keys); + tmpValBuffer.copyTo(vals); + } +} +} /* merge_sort */ + +} +} /* namespace cv { namespace ocl */ + + +void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan) +{ + CV_Assert( keys.rows == 1 ); // we only allow one dimensional input + CV_Assert( keys.channels() == 1 ); // we only allow one channel keys + CV_Assert( vecSize <= static_cast(keys.cols) ); + switch(method) + { + case SORT_BITONIC: + bitonic_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); + break; + case SORT_SELECTION: + selection_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); + break; + case SORT_RADIX: + radix_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); + break; + case SORT_MERGE: + merge_sort::sort_by_key(keys, vals, vecSize, isGreaterThan); + break; + } +} + + +void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, int method, bool isGreaterThan) +{ + CV_Assert( keys.size() == vals.size() ); + CV_Assert( keys.rows == 1 ); // we only allow one dimensional input + size_t vecSize = static_cast(keys.cols); + sort_by_key(keys, vals, vecSize, method, isGreaterThan); +} diff --git a/modules/ocl/test/test_sort.cpp b/modules/ocl/test/test_sort.cpp new file mode 100644 index 0000000000..250628a58e --- /dev/null +++ b/modules/ocl/test/test_sort.cpp @@ -0,0 +1,244 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +#include +#include +#include "precomp.hpp" + +using namespace std; +using namespace cvtest; +using namespace testing; +using namespace cv; + + +namespace +{ +IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool) +IMPLEMENT_PARAM_CLASS(InputSize, int) +IMPLEMENT_PARAM_CLASS(SortMethod, int) + + +template +struct KV_CVTYPE{ static int toType() {return 0;} }; + +template<> struct KV_CVTYPE { static int toType() {return CV_32SC1;} }; +template<> struct KV_CVTYPE{ static int toType() {return CV_32FC1;} }; +template<> struct KV_CVTYPE{ static int toType() {return CV_32SC2;} }; +template<> struct KV_CVTYPE{ static int toType() {return CV_32FC2;} }; + +template +bool kvgreater(pair p1, pair p2) +{ + return p1.first > p2.first; +} + +template +bool kvless(pair p1, pair p2) +{ + return p1.first < p2.first; +} + +template +void toKVPair( + MatConstIterator_ kit, + MatConstIterator_ vit, + int vecSize, + vector >& kvres + ) +{ + kvres.clear(); + for(int i = 0; i < vecSize; i ++) + { + kvres.push_back(make_pair(*kit, *vit)); + ++kit; + ++vit; + } +} + +template +void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false) +{ + vector > kvres; + toKVPair(keys.begin(), vals.begin(), keys.cols, kvres); + + if(isGreater) + { + std::sort(kvres.begin(), kvres.end(), kvgreater); + } + else + { + std::sort(kvres.begin(), kvres.end(), kvless); + } + key_type * kptr = keys.ptr(); + val_type * vptr = vals.ptr(); + for(int i = 0; i < keys.cols; i ++) + { + kptr[i] = kvres[i].first; + vptr[i] = kvres[i].second; + } +} + +class SortByKey_STL +{ +public: + static void sort(cv::Mat&, cv::Mat&, bool is_gt); +private: + typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool); + SortByKey_STL(); + quick_sorter quick_sorters[CV_64FC4][CV_64FC4]; + static SortByKey_STL instance; +}; + +SortByKey_STL SortByKey_STL::instance = SortByKey_STL(); + +SortByKey_STL::SortByKey_STL() +{ + memset(instance.quick_sorters, 0, sizeof(quick_sorters)); +#define NEW_SORTER(KT, VT) \ + instance.quick_sorters[KV_CVTYPE::toType()][KV_CVTYPE::toType()] = kvquicksort; + + NEW_SORTER(int, int); + NEW_SORTER(int, Vec2i); + NEW_SORTER(int, float); + NEW_SORTER(int, Vec2f); + + NEW_SORTER(float, int); + NEW_SORTER(float, Vec2i); + NEW_SORTER(float, float); + NEW_SORTER(float, Vec2f); +#undef NEW_SORTER +} + +void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt) +{ + instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt); +} + +bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_, + const Mat& /*dkeys_*/, const Mat& dvals_) +{ + int cn_val = gvals_.channels(); + int count = gkeys_.cols; + + //for convenience we convert depth to float and channels to 1 + Mat gkeys, gvals, dkeys, dvals; + gkeys_.reshape(1).convertTo(gkeys, CV_32F); + gvals_.reshape(1).convertTo(gvals, CV_32F); + //dkeys_.reshape(1).convertTo(dkeys, CV_32F); + dvals_.reshape(1).convertTo(dvals, CV_32F); + float * gkptr = gkeys.ptr(); + float * gvptr = gvals.ptr(); + //float * dkptr = dkeys.ptr(); + float * dvptr = dvals.ptr(); + + for(int i = 0; i < count - 1; ++i) + { + int iden_count = 0; + // firstly calculate the number of identical keys + while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count]) + { + ++ iden_count; + } + + // sort dv and gv + int num_of_val = (iden_count + 1) * cn_val; + std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val); + std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val); + + // then check if [i, i + iden_count) is the same + for(int j = 0; j < num_of_val; ++j) + { + if(gvptr[i + j] != dvptr[i + j]) + { + return false; + } + } + i += iden_count; + } + return true; +} +} + +#define INPUT_SIZES Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16 +#define KEY_TYPES Values(MatType(CV_32SC1), MatType(CV_32FC1)) +#define VAL_TYPES Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2)) +#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/) +#define F_OR_T Values(IsGreaterThan(false), IsGreaterThan(true)) + +PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan) +{ + InputSize input_size; + MatType key_type, val_type; + SortMethod method; + IsGreaterThan is_gt; + + Mat mat_key, mat_val; + virtual void SetUp() + { + input_size = GET_PARAM(0); + key_type = GET_PARAM(1); + val_type = GET_PARAM(2); + method = GET_PARAM(3); + is_gt = GET_PARAM(4); + + using namespace cv; + // fill key and val + mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX); + mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX); + } +}; + +TEST_P(SortByKey, Accuracy) +{ + using namespace cv; + ocl::oclMat oclmat_key(mat_key); + ocl::oclMat oclmat_val(mat_val); + + ocl::sort_by_key(oclmat_key, oclmat_val, method, is_gt); + SortByKey_STL::sort(mat_key, mat_val, is_gt); + + EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0); + EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val)); +} +INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T));