diff --git a/modules/core/include/opencv2/core/cuda/detail/reduce.hpp b/modules/core/include/opencv2/core/cuda/detail/reduce.hpp index 8af20b0dc8..05a672c3dc 100644 --- a/modules/core/include/opencv2/core/cuda/detail/reduce.hpp +++ b/modules/core/include/opencv2/core/cuda/detail/reduce.hpp @@ -134,6 +134,22 @@ namespace cv { namespace cuda { namespace device { val = smem[tid]; } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 template __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, @@ -142,6 +158,7 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); } + template __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, @@ -151,18 +168,6 @@ namespace cv { namespace cuda { namespace device For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); } - template - __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) - { - T reg = smem[tid + delta]; - smem[tid] = val = op(val, reg); - } - template - __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) - { - T reg = shfl_down(val, delta, width); - val = op(val, reg); - } template @@ -183,7 +188,31 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); } +#else + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid, unsigned int delta, const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, unsigned int delta, unsigned int width, const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); + } +#endif template struct Generic { template diff --git a/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp b/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp index df37c173be..4a248c8365 100644 --- a/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp +++ b/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp @@ -177,6 +177,8 @@ namespace cv { namespace cuda { namespace device { data = smem[tid]; } + +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, @@ -193,9 +195,18 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); } - - ////////////////////////////////////////////////////// - // copyVals +#else + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, const thrust::tuple& data, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, const thrust::tuple& data, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } +#endif template __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) @@ -207,24 +218,6 @@ namespace cv { namespace cuda { namespace device { svals[tid] = val = svals[tid + delta]; } - template - __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, - unsigned int delta, - int width) - { - For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); - } - template - __device__ __forceinline__ void copyVals(const thrust::tuple& svals, - const thrust::tuple& val, - unsigned int tid, unsigned int delta) - { - For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); - } - - ////////////////////////////////////////////////////// - // merge template __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) @@ -248,6 +241,24 @@ namespace cv { namespace cuda { namespace device copyVals(svals, val, tid, delta); } } + +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + template @@ -305,7 +316,61 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); } +#else + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + template + __device__ __forceinline__ void mergeShfl(K& key, const thrust::tuple& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, const thrust::tuple& svals, + const thrust::tuple& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } + +#endif ////////////////////////////////////////////////////// // Generic diff --git a/modules/core/include/opencv2/core/cuda/reduce.hpp b/modules/core/include/opencv2/core/cuda/reduce.hpp index 5de3650817..fb74de95a8 100644 --- a/modules/core/include/opencv2/core/cuda/reduce.hpp +++ b/modules/core/include/opencv2/core/cuda/reduce.hpp @@ -64,6 +64,12 @@ namespace cv { namespace cuda { namespace device { reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 template &>(smem, val, tid, op); } - template - __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) - { - reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); - } template &, const Cmp&>(skeys, key, svals, val, tid, cmp); } + template & >(skeys, key, svals, val, tid, cmp); } +#else + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid, const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const Cmp&>(skeys, key, svals, val, tid, cmp); + } + + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, const thrust::tuple& key, const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const thrust::tuple&, const thrust::tuple&, const thrust::tuple&>(skeys, key, svals, val, tid, cmp); + } +#endif // smem_tuple