Fix thread sync for csbp.

This commit is contained in:
peng xiao 2013-04-23 17:35:40 +08:00
parent c701d54281
commit 9cfa24e515

View File

@ -221,6 +221,8 @@ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, _
cur = next;
next = data_cost[(d + 1) * cdisp_step1];
}
for (int i = nr_local_minimum; i < nr_plane; i++)
{
float minimum = FLT_MAX;
@ -402,23 +404,90 @@ __kernel void init_data_cost_reduce_0(__global short *ctemp, __global uchar *cle
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 256)
{
if (tid < 128)
dline[tid] += dline[tid + 128];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 128)
{
if (tid < 64)
dline[tid] += dline[tid + 64];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
__global short* data_cost = ctemp + y_out * cmsg_step1 + x_out;
if (tid == 0)
data_cost[cdisp_step1 * d] = convert_short_sat_rte(dline[0]);
}
@ -470,23 +539,85 @@ __kernel void init_data_cost_reduce_1(__global float *ctemp, __global uchar *cle
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 256)
if (tid < 128)
dline[tid] += dline[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 128)
if (tid < 64)
dline[tid] += dline[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__global float *data_cost = ctemp + y_out * cmsg_step1 + x_out;
__local float* dline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = dline[0];
}
@ -652,21 +783,77 @@ __kernel void compute_data_cost_reduce_0(__global const short* selected_disp_pyr
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
// if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
//if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
{
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
{
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
{
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
{
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
{
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
{
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = convert_short_sat_rte(vdline[0]);
}
@ -727,21 +914,77 @@ __kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
//if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
//if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
{
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
{
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
{
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
{
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
{
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
{
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = vdline[0];
}
@ -870,7 +1113,7 @@ __kernel void init_message_0(__global short *u_new_, __global short *d_new_, __g
}
}
__kernel void init_message_1(__global float *u_new_, __global float *d_new_, __global float *l_new_,
__global float *r_new_, __global float *u_cur_, __global const float *d_cur_,
__global float *r_new_, __global const float *u_cur_, __global const float *d_cur_,
__global const float *l_cur_, __global const float *r_cur_, __global float *ctemp,
__global float *selected_disp_pyr_new, __global const float *selected_disp_pyr_cur,
__global float *data_cost_selected_, __global const float *data_cost_,
@ -880,8 +1123,7 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global const float *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
__global const float *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2;
__global const float *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1);
@ -892,6 +1134,8 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
__global const float *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
__global const float *data_cost = data_cost_ + y * cmsg_step1 + x;
if (y < h && x < w)
{
for(int d = 0; d < nr_plane2; d++)
{
int idx2 = d * cdisp_step2;
@ -899,6 +1143,7 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
float val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * cdisp_step1] = val;
}
}
__global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global float *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
@ -908,17 +1153,40 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
__global float *l_new = l_new_ + y * cmsg_step1 + x;
__global float *r_new = r_new_ + y * cmsg_step1 + x;
barrier(CLK_LOCAL_MEM_FENCE);
if(y < h && x < w)
{
u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
get_first_k_element_increase_1(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
data_cost_selected, disparity_selected_new, data_cost_new,
data_cost, disparity_selected_cur, nr_plane, nr_plane2,
cdisp_step1, cdisp_step2);
for(int i = 0; i < nr_plane; i++)
{
float minimum = FLT_MAX;
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
float cur = data_cost_new[j * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = j;
}
}
data_cost_selected[i * cdisp_step1] = data_cost[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = FLT_MAX;
}
}
}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
@ -1132,4 +1400,3 @@ __kernel void compute_disp_1(__global const float *u_, __global const float *d_,
disp[res_step * y + x] = best;
}
}