Merge pull request #679 from jet47:gpu-simd-functions

This commit is contained in:
Andrey Kamaev 2013-03-21 14:59:55 +04:00 committed by OpenCV Buildbot
commit d756de176f
3 changed files with 1144 additions and 823 deletions

View File

@ -0,0 +1,910 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of NVIDIA Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#include "common.hpp"
/*
This header file contains inline functions that implement intra-word SIMD
operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
to make the code portable across all GPUs supported by CUDA. The following
functions are currently implemented:
vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
vavg2(a,b) per-halfword unsigned average: (a + b) / 2
vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
vmax2(a,b) per-halfword unsigned maximum: max(a, b)
vmin2(a,b) per-halfword unsigned minimum: min(a, b)
vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
vavg4(a,b) per-byte unsigned average: (a + b) / 2
vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0
vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff: 0
vmax4(a,b) per-byte unsigned maximum: max(a, b)
vmin4(a,b) per-byte unsigned minimum: min(a, b)
*/
namespace cv { namespace gpu { namespace device
{
// 2
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = a ^ b; // sum bits
r = a + b; // actual sum
s = s ^ r; // determine carry-ins for each bit position
s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
r = r - s; // subtract out carry-out from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = a ^ b; // sum bits
r = a - b; // actual sum
s = s ^ r; // determine carry-ins for each bit position
s = s & 0x00010000; // borrow to high word
r = r + s; // compensate for borrow from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u, v;
s = a & 0x0000ffff; // extract low halfword
r = b & 0x0000ffff; // extract low halfword
u = ::max(r, s); // maximum of low halfwords
v = ::min(r, s); // minimum of low halfwords
s = a & 0xffff0000; // extract high halfword
r = b & 0xffff0000; // extract high halfword
t = ::max(r, s); // maximum of high halfwords
s = ::min(r, s); // minimum of high halfwords
r = u | t; // maximum of both halfwords
s = v | s; // minimum of both halfwords
r = r - s; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
s = s >> 1;
s = r + s;
return s;
}
static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int s;
s = a ^ b;
r = a | b;
s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
s = s >> 1;
r = r - s;
#endif
return r;
}
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vseteq2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetge2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::max(r, s); // maximum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::max(r, s); // maximum of high halfwords
r = t | u; // combine halfword maximums
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::min(r, s); // minimum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::min(r, s); // minimum of high halfwords
r = t | u; // combine halfword minimums
#endif
return r;
}
// 4
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ b; // sum bits
r = a & 0x7f7f7f7f; // clear msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // msb sum bits
r = r + t; // add without msbs, record carry-out in msbs
r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
#endif /* __CUDA_ARCH__ >= 300 */
return r;
}
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ ~b; // inverted sum bits
r = a | 0x80808080; // set msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // inverted msb sum bits
r = r - t; // subtract w/o msbs, record inverted borrows in msb
r = r ^ s; // combine inverted msb sum bits and borrows
#endif
return r;
}
static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
s = s >> 1;
s = r + s;
return s;
}
static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int c;
c = a ^ b;
r = a | b;
c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
c = c >> 1;
r = r - c;
#endif
return r;
}
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x80
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r & ~c; // msb = 1, if r was 0x00
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
unsigned int r, t;
#if __CUDA_ARCH__ >= 300
r = vseteq4(a, b);
t = r << 8; // convert bool
r = t - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
t = a ^ b; // 0x00 if a == b
r = t | 0x80808080; // set msbs, to catch carry out
t = t ^ r; // extract msbs, msb = 1 if t < 0x80
r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
r = t & ~r; // msb = 1, if t was 0x00
t = r >> 7; // build mask
t = r - t; // from
r = t | r; // msbs
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
{
unsigned int r, s;
#if __CUDA_ARCH__ >= 300
r = vsetge4(a, b);
s = r << 8; // convert bool
r = s - r; // to mask
#else
asm ("not.b32 %0,%0;" : "+r"(b));
r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
r = r & 0x80808080; // msb = carry-outs
s = r >> 7; // build mask
s = r - s; // from
r = s | r; // msbs
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a ^ b; //
s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
r = s ^ r; // select a when b >= a, else select b => min(a,b)
r = s - r; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a & s; // select a when b >= a
s = b & ~s; // select b when b < a
r = r | s; // combine byte selections
#endif
return r; // byte-wise unsigned maximum
}
static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(b, a); // mask = 0xff if a >= b
r = a & s; // select a when b >= a
s = b & ~s; // select b when b < a
r = r | s; // combine byte selections
#endif
return r;
}
}}}
#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__

View File

@ -48,6 +48,7 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/simd_functions.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@ -154,170 +155,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAdd4;
template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
};
template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
};
template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
};
template <> struct VAdd4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAdd2;
template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
};
template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
};
template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
};
template <> struct VAdd2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
};
////////////////////////////////////
@ -336,13 +195,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@ -543,170 +390,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VSub4;
template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, uint>& other) {}
};
template <> struct VSub4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {}
};
template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {}
};
template <> struct VSub4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, int>& other) {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VSub2;
template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {}
};
template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {}
};
template <> struct VSub2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {}
};
template <> struct VSub2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, int>& other) {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
};
////////////////////////////////////
@ -725,13 +430,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@ -1496,90 +1189,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAbsDiff4;
template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {}
};
template <> struct VAbsDiff4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAbsDiff2;
template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {}
};
template <> struct VAbsDiff2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
};
////////////////////////////////////
@ -1611,13 +1242,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@ -1877,6 +1500,49 @@ namespace arithm
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
};
////////////////////////////////////
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
@ -1890,6 +1556,21 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@ -2303,44 +2001,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMin4;
template <> struct VMin4<uint> : binary_function<uint, uint, uint>
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
};
template <> struct VMin4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
@ -2349,40 +2014,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMin2;
template <> struct VMin2<uint> : binary_function<uint, uint, uint>
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
};
template <> struct VMin2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
@ -2392,13 +2028,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMin4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMin2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@ -2415,14 +2051,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@ -2430,12 +2066,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -2463,44 +2093,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMax4;
template <> struct VMax4<uint> : binary_function<uint, uint, uint>
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
};
template <> struct VMax4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
@ -2509,40 +2106,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMax2;
template <> struct VMax2<uint> : binary_function<uint, uint, uint>
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
};
template <> struct VMax2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
@ -2552,13 +2120,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMax4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMax2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@ -2575,14 +2143,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@ -2590,12 +2158,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

View File

@ -263,11 +263,8 @@ namespace
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vadd4<unsigned int, unsigned int>,
vadd4<unsigned int, int>,
0,
0
},
{
vadd4<int, unsigned int>,
vadd4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vadd2<unsigned int, unsigned int>,
vadd2<unsigned int, int>
},
{
0,
0,
vadd2<int, unsigned int>,
vadd2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@ -434,30 +375,26 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vsub4<unsigned int, unsigned int>,
vsub4<unsigned int, int>,
0,
0
},
{
vsub4<int, unsigned int>,
vsub4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vsub2<unsigned int, unsigned int>,
vsub2<unsigned int, int>
},
{
0,
0,
vsub2<int, unsigned int>,
vsub2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@ -777,30 +655,26 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
absDiffMat<float>,
absDiffMat<double>
};
static const func_t vfuncs4[] =
{
vabsDiff4<unsigned int>,
vabsDiff4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vabsDiff2<unsigned int>,
vabsDiff2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@ -1653,30 +1510,26 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
};
typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
static const func_v4_t funcs_v4[] =
{
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};
const int depth = src1.depth();
const int cn = src1.channels();
@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
const int vcols = src1_.cols >> 2;
funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
}
const func_t func = funcs[depth][code];
func(src1_, src2_, dst_, stream);
@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
minMat<float>,
minMat<double>
};
static const func_t vfuncs4[] =
{
vmin4<unsigned int>,
vmin4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmin2<unsigned int>,
vmin2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@ -2601,30 +2472,26 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
maxMat<float>,
maxMat<double>
};
static const func_t vfuncs4[] =
{
vmax4<unsigned int>,
vmax4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmax2<unsigned int>,
vmax2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@ -2698,30 +2551,26 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;