moved GPU's global counter reset from caller to the kernel's end

This commit is contained in:
Alexey Spizhevoy 2010-11-29 08:09:54 +00:00
parent 3da253a259
commit 8c48f3be9d

View File

@ -419,10 +419,8 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth;
__constant__ int ctheight;
static const unsigned int czero = 0;
// Global counter of blocks finished its work
__device__ unsigned int blocks_finished;
__device__ unsigned int blocks_finished = 0;
// Estimates good thread configuration
@ -548,6 +546,7 @@ namespace cv { namespace gpu { namespace mathfunc
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
@ -570,7 +569,6 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
@ -611,6 +609,7 @@ namespace cv { namespace gpu { namespace mathfunc
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
@ -625,7 +624,6 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
@ -654,10 +652,8 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth;
__constant__ int ctheight;
static const unsigned int czero = 0;
// Global counter of blocks finished its work
__device__ unsigned int blocks_finished;
__device__ unsigned int blocks_finished = 0;
// Estimates good thread configuration
@ -810,6 +806,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
@ -837,7 +834,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
@ -890,6 +886,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
@ -907,7 +904,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
@ -943,9 +939,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth;
__constant__ int ctheight;
static const unsigned int czero = 0;
__device__ unsigned int blocks_finished;
__device__ unsigned int blocks_finished = 0;
void estimate_thread_cfg(dim3& threads, dim3& grid)
{
@ -1036,7 +1030,11 @@ namespace cv { namespace gpu { namespace mathfunc
sum_is_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0];
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
@ -1053,7 +1051,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
@ -1081,7 +1078,11 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < size ? count[tid] : 0;
sum_is_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0];
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
@ -1094,7 +1095,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());