Mirror of https://github.com/opencv/opencv.git
moved GPU's global counter reset from caller to the kernel's end
This commit is contained in: parent 3da253a259, commit 8c48f3be9d
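For context, here is a minimal sketch of the single-pass reduction pattern this commit moves to. The kernel, buffer, and variable names below (sum_kernel, partial_sums, is_last_block) are illustrative assumptions, not the actual OpenCV mathfunc code. The completion counter blocks_finished is statically initialized on the device, and the last block to finish both produces the final result and resets the counter at the end of the kernel, so the caller no longer needs to zero the symbol with cudaMemcpyToSymbol before every launch.

#include <cuda_runtime.h>

// Statically initialized to 0, and reset to 0 by the last block of every launch,
// so consecutive launches never need a host-side cudaMemcpyToSymbol reset.
__device__ unsigned int blocks_finished = 0;

template <int nthreads>
__global__ void sum_kernel(const int* data, int n, int* partial_sums)
{
    __shared__ int smem[nthreads];
    __shared__ bool is_last_block;

    const int tid = threadIdx.x;
    const int idx = blockIdx.x * nthreads + tid;
    smem[tid] = idx < n ? data[idx] : 0;
    __syncthreads();

    // Per-block tree reduction in shared memory (blockDim.x == nthreads assumed).
    for (int s = nthreads / 2; s > 0; s >>= 1)
    {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }

    if (tid == 0)
    {
        partial_sums[blockIdx.x] = smem[0];
        __threadfence();  // make this block's partial result visible to other blocks
        unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
        is_last_block = (ticket == gridDim.x - 1);
    }
    __syncthreads();

    // The last block to finish folds all partial sums into partial_sums[0].
    if (is_last_block)
    {
        int sum = 0;
        for (int i = tid; i < (int)gridDim.x; i += nthreads) sum += partial_sums[i];
        smem[tid] = sum;
        __syncthreads();
        for (int s = nthreads / 2; s > 0; s >>= 1)
        {
            if (tid < s) smem[tid] += smem[tid + s];
            __syncthreads();
        }
        if (tid == 0)
        {
            partial_sums[0] = smem[0];
            blocks_finished = 0;  // the in-kernel reset this commit introduces
        }
    }
}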
@@ -419,10 +419,8 @@ namespace cv { namespace gpu { namespace mathfunc
     __constant__ int ctwidth;
     __constant__ int ctheight;
 
-    static const unsigned int czero = 0;
-
     // Global counter of blocks finished its work
-    __device__ unsigned int blocks_finished;
+    __device__ unsigned int blocks_finished = 0;
 
 
     // Estimates good thread configuration
@@ -548,6 +546,7 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             minval[0] = (T)sminval[0];
             maxval[0] = (T)smaxval[0];
+            blocks_finished = 0;
         }
     }
 #else
@@ -570,7 +569,6 @@ namespace cv { namespace gpu { namespace mathfunc
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
         cudaSafeCall(cudaThreadSynchronize());
 
@@ -611,6 +609,7 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             minval[0] = (T)sminval[0];
             maxval[0] = (T)smaxval[0];
+            blocks_finished = 0;
         }
     }
 
@@ -625,7 +624,6 @@ namespace cv { namespace gpu { namespace mathfunc
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
         min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
@@ -654,10 +652,8 @@ namespace cv { namespace gpu { namespace mathfunc
     __constant__ int ctwidth;
     __constant__ int ctheight;
 
-    static const unsigned int czero = 0;
-
     // Global counter of blocks finished its work
-    __device__ unsigned int blocks_finished;
+    __device__ unsigned int blocks_finished = 0;
 
 
     // Estimates good thread configuration
@@ -810,6 +806,7 @@ namespace cv { namespace gpu { namespace mathfunc
             maxval[0] = (T)smaxval[0];
             minloc[0] = sminloc[0];
             maxloc[0] = smaxloc[0];
+            blocks_finished = 0;
         }
     }
 #else
@@ -837,7 +834,6 @@ namespace cv { namespace gpu { namespace mathfunc
         unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
         unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
         cudaSafeCall(cudaThreadSynchronize());
 
@@ -890,6 +886,7 @@ namespace cv { namespace gpu { namespace mathfunc
             maxval[0] = (T)smaxval[0];
             minloc[0] = sminloc[0];
             maxloc[0] = smaxloc[0];
+            blocks_finished = 0;
         }
     }
 
@@ -907,7 +904,6 @@ namespace cv { namespace gpu { namespace mathfunc
         unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
         unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
         min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
@@ -943,9 +939,7 @@ namespace cv { namespace gpu { namespace mathfunc
     __constant__ int ctwidth;
     __constant__ int ctheight;
 
-    static const unsigned int czero = 0;
-
-    __device__ unsigned int blocks_finished;
+    __device__ unsigned int blocks_finished = 0;
 
     void estimate_thread_cfg(dim3& threads, dim3& grid)
     {
@@ -1036,7 +1030,11 @@ namespace cv { namespace gpu { namespace mathfunc
 
         sum_is_smem<nthreads, unsigned int>(scount, tid);
 
-        if (tid == 0) count[0] = scount[0];
+        if (tid == 0)
+        {
+            count[0] = scount[0];
+            blocks_finished = 0;
+        }
     }
 #else
         if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
@@ -1053,7 +1051,6 @@ namespace cv { namespace gpu { namespace mathfunc
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
         cudaSafeCall(cudaThreadSynchronize());
 
@@ -1081,7 +1078,11 @@ namespace cv { namespace gpu { namespace mathfunc
         scount[tid] = tid < size ? count[tid] : 0;
         sum_is_smem<nthreads, unsigned int>(scount, tid);
 
-        if (tid == 0) count[0] = scount[0];
+        if (tid == 0)
+        {
+            count[0] = scount[0];
+            blocks_finished = 0;
+        }
     }
 
 
@@ -1094,7 +1095,6 @@ namespace cv { namespace gpu { namespace mathfunc
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
         count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
         count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
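Taken together, the hunks above replace the per-call host-side reset, cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished))), with a counter that is statically initialized to zero and re-zeroed by the last block at the end of each kernel; this also lets the czero constant, used only for that reset, be dropped.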