Merge pull request #24813 from fengyuentau:speedup_scatter

dnn: improve scatter and scatterND speed with multi-threading
This commit is contained in:
Alexander Smorkalov 2024-01-17 17:16:50 +03:00 committed by GitHub
commit ac4c0bffac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 241 additions and 242 deletions

View File

@ -258,22 +258,21 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
test_slice<4>(inputShape, begin, end); test_slice<4>(inputShape, begin, end);
} }
struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> > using Layer_Scatter = TestBaseWithParam<tuple<std::vector<int>, std::string, int, tuple<Backend, Target>>>;
{ PERF_TEST_P_(Layer_Scatter, scatter) {
void test_layer(const std::vector<int>& shape, const String reduction = "none", int axis = 0) std::vector<int> shape = get<0>(GetParam());
{ std::string reduction = get<1>(GetParam());
int backendId = get<0>(GetParam()); int axis = get<2>(GetParam());
int targetId = get<1>(GetParam()); int backend_id = get<0>(get<3>(GetParam()));
int target_id = get<1>(get<3>(GetParam()));
Mat data(shape, CV_32FC1); Mat data(shape, CV_32FC1);
Mat indices(shape, CV_32FC1); Mat indices(shape, CV_32FC1);
Mat updates(shape, CV_32FC1); Mat updates(shape, CV_32FC1);
Scalar mean = 0.f; randn(data, 0.f, 1.f);
Scalar std = 1.f;
randn(data, mean, std);
randu(indices, 0, shape[axis]); randu(indices, 0, shape[axis]);
randn(updates, mean, std); randn(updates, 0.f, 1.f);
indices.convertTo(indices, CV_32SC1, 1, -1); indices.convertTo(indices, CV_32SC1, 1, -1);
@ -291,20 +290,18 @@ struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
// warmup // warmup
{ {
std::vector<String> inpNames(3); std::vector<String> input_names{"data", "indices", "updates"};
inpNames[0] = "data"; net.setInputsNames(input_names);
inpNames[1] = "indices"; net.setInput(data, input_names[0]);
inpNames[2] = "updates"; net.setInput(indices, input_names[1]);
net.setInputsNames(inpNames); net.setInput(updates, input_names[2]);
net.setInput(data, inpNames[0]);
net.setInput(indices, inpNames[1]);
net.setInput(updates, inpNames[2]);
net.setPreferableBackend(backendId); net.setPreferableBackend(backend_id);
net.setPreferableTarget(targetId); net.setPreferableTarget(target_id);
Mat out = net.forward(); Mat out = net.forward();
} }
// perf
TEST_CYCLE() TEST_CYCLE()
{ {
Mat res = net.forward(); Mat res = net.forward();
@ -313,28 +310,26 @@ struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
SANITY_CHECK_NOTHING(); SANITY_CHECK_NOTHING();
} }
int N = 8; INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine(
int C = 256; Values(std::vector<int>{2, 128, 64, 50}),
int H = 128; Values(std::string("none"), std::string("add")),
int W = 100; Values(0), // use Values(0, 1, 2, 3) for more details
}; dnnBackendsAndTargets(/* withInferenceEngine= */ false,
/* withHalide= */ false,
/* withCpuOCV= */ true,
/* withVkCom= */ false,
/* withCUDA= */ false,
/* withNgraph= */ false,
/* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
));
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter) using Layer_ScatterND = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
{ PERF_TEST_P_(Layer_ScatterND, scatterND) {
test_layer({N, C, H, W}); std::vector<int> shape = get<0>(GetParam());
} std::string reduction = get<1>(GetParam());
int backend_id = get<0>(get<2>(GetParam()));
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add) int target_id = get<1>(get<2>(GetParam()));
{
test_layer({N, C, H, W}, "add");
}
struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
{
void test_layer(const std::vector<int>& shape, const String reduction = "none")
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
std::vector<int> indices_shape(shape); std::vector<int> indices_shape(shape);
indices_shape.push_back(int(shape.size())); indices_shape.push_back(int(shape.size()));
@ -342,12 +337,10 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
Mat indices(indices_shape, CV_32FC1); Mat indices(indices_shape, CV_32FC1);
Mat updates(shape, CV_32FC1); Mat updates(shape, CV_32FC1);
Scalar mean = 0.f; randn(data, 0.f, 1.f);
Scalar std = 1.f; randn(updates, 0.f, 1.f);
randn(data, mean, std);
randn(updates, mean, std);
// initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W] // Create indices such that indices[n_i, c_j, h_k, w_l, :4] = [i, j, k, l]
std::vector<int> current_index_tuple(shape.size()); std::vector<int> current_index_tuple(shape.size());
int total = data.total(); int total = data.total();
std::vector<int> indices_step; std::vector<int> indices_step;
@ -357,6 +350,7 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
indices_step.push_back(step); indices_step.push_back(step);
} }
int t, j, idx, offset_at_idx, offset; int t, j, idx, offset_at_idx, offset;
auto *indices_ptr = indices.ptr<float>();
for (int i = 0; i < total; i++) for (int i = 0; i < total; i++)
{ {
t = i; t = i;
@ -373,7 +367,7 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
offset += current_index_tuple[j] * indices_step[j]; offset += current_index_tuple[j] * indices_step[j];
for (j = 0; j < shape.size(); j++) for (j = 0; j < shape.size(); j++)
indices.at<float>(offset + j) = current_index_tuple[j]; indices_ptr[offset + j] = current_index_tuple[j];
} }
Net net; Net net;
@ -389,17 +383,14 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
// warmup // warmup
{ {
std::vector<String> inpNames(3); std::vector<String> input_names{"data", "indices", "updates"};
inpNames[0] = "data"; net.setInputsNames(input_names);
inpNames[1] = "indices"; net.setInput(data, input_names[0]);
inpNames[2] = "updates"; net.setInput(indices, input_names[1]);
net.setInputsNames(inpNames); net.setInput(updates, input_names[2]);
net.setInput(data, inpNames[0]);
net.setInput(indices, inpNames[1]);
net.setInput(updates, inpNames[2]);
net.setPreferableBackend(backendId); net.setPreferableBackend(backend_id);
net.setPreferableTarget(targetId); net.setPreferableTarget(target_id);
Mat out = net.forward(); Mat out = net.forward();
} }
@ -411,21 +402,18 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
SANITY_CHECK_NOTHING(); SANITY_CHECK_NOTHING();
} }
int N = 8; INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine(
int C = 256; Values(std::vector<int>{2, 128, 64, 50}),
int H = 128; Values(std::string("none"), std::string("add")),
int W = 100; dnnBackendsAndTargets(/* withInferenceEngine= */ false,
}; /* withHalide= */ false,
/* withCpuOCV= */ true,
PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND) /* withVkCom= */ false,
{ /* withCUDA= */ false,
test_layer({N, C, H ,W}); /* withNgraph= */ false,
} /* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add) ));
{
test_layer({N, C, H , W}, "add");
}
struct Layer_LayerNorm : public TestBaseWithParam<tuple<Backend, Target> > struct Layer_LayerNorm : public TestBaseWithParam<tuple<Backend, Target> >
{ {
@ -860,8 +848,6 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA))); INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
#endif #endif
INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));

View File

@ -89,49 +89,59 @@ public:
// NOTE: This impl does not check whether indices have duplicate entries. // NOTE: This impl does not check whether indices have duplicate entries.
// The last duplicate entry will overwrite the previous. // The last duplicate entry will overwrite the previous.
template<typename T, typename Functor> template<typename T, typename Functor>
void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) {
{ input_mat.copyTo(output_mat);
data.copyTo(out);
const int* shape = data.size.p; const auto &input_mat_shape = shape(input_mat);
const size_t* step = data.step.p; std::vector<size_t> input_mat_step(input_mat_shape.size());
for (int i = 0; i < input_mat.dims; i++) {
input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
}
const int ind_ndims = indices.dims; const int indices_mat_ndims = indices_mat.dims;
const int* ind_shape = indices.size.p; const auto &indices_mat_shape = shape(indices_mat);
const T* p_indices = indices.ptr<const T>();
const int upd_ndims = updates.dims; const int updates_mat_ndims = updates_mat.dims;
const int* upd_shape = updates.size.p; const auto &updates_mat_shape = shape(updates_mat);
const T* p_updates = updates.ptr<const T>();
T* p_out = out.ptr<T>(); int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices
int k = ind_shape[ind_ndims - 1]; // last dim of indices
size_t total = (size_t)(indices.total() / k);
size_t updates_size = 1; size_t updates_size = 1;
for (int i = ind_ndims - 1; i < upd_ndims; i++) for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++)
updates_size *= upd_shape[i]; updates_size *= updates_mat_shape[i];
size_t inp_start_offset = 0; auto fn = [&](const Range &r) {
size_t ind_start_offset = 0; size_t input_offset = 0,
size_t upd_start_offset = 0; indices_offset = r.start * indices_last_dim,
for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size) updates_offset = r.start * updates_size;
{ for (int i = r.start; i < r.end; i++) {
const T* tmp_p_indices = p_indices + ind_start_offset; const T* indices = indices_mat.ptr<const T>();
inp_start_offset = 0; const T* updates = updates_mat.ptr<const T>();
for (int j = 0; j < k; j++) T* output = output_mat.ptr<T>();
{
CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]);
inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j];
}
inp_start_offset /= sizeof(T);
const T* tmp_p_updates = p_updates + upd_start_offset; input_offset = 0;
T* tmp_p_out = p_out + inp_start_offset; indices += indices_offset;
for (int j = 0; j < updates_size; j++) for (int j = 0; j < indices_last_dim; j++) {
tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]); int index = static_cast<int>(*(indices + j));
index = (index + input_mat_shape[j]) % input_mat_shape[j];
CV_Assert(index < input_mat_shape[j] && index >= 0);
input_offset += index * input_mat_step[j];
} }
updates += updates_offset;
output += input_offset;
for (int j = 0; j < updates_size; j++) {
output[j] = reduce_operation(output[j], updates[j]);
}
indices_offset += indices_last_dim;
updates_offset += updates_size;
}
};
size_t total = (size_t)(indices_mat.total() / indices_last_dim);
double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
} }
template<typename... Args> template<typename... Args>

View File

@ -81,59 +81,62 @@ public:
} }
template<typename T, typename Functor> template<typename T, typename Functor>
void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) {
{ input_mat.copyTo(output_mat);
data.copyTo(out);
const int ndims = data.dims; const int ndims = input_mat.dims;
const int* shape = data.size.p;
const size_t* step = data.step.p;
const int* ind_shape = indices.size.p; const auto &input_mat_shape = shape(input_mat);
const size_t* ind_step = indices.step.p; std::vector<size_t> input_mat_step(ndims);
size_t inp_offset = 0; const auto &indices_mat_shape = shape(indices_mat);
size_t ind_offset = 0; std::vector<size_t> indices_mat_step(ndims);
const T* p_index = indices.ptr<const T>();
const T* p_update = updates.ptr<const T>();
T* p_out = out.ptr<T>();
size_t total = indices.total(); for (int i = 0; i < ndims; i++) {
input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
indices_mat_step[i] = static_cast<size_t>(indices_mat.step.p[i] / sizeof(T));
}
int j, offset_at_idx, index; auto fn = [&](const Range &r) {
size_t t, idx; size_t input_offset = 0, indices_offset = 0;
for (size_t i = 0; i < total; i++)
{ int indices_index, index;
t = i; size_t axis_offset, tmp_index, j_index;
inp_offset = 0; for (int i = r.start; i < r.end; i++) {
ind_offset = 0; const T* indices = indices_mat.ptr<const T>();
int offset_at_axis = 0; const T* updates = updates_mat.ptr<const T>();
for (j = ndims - 1; j >= 0; j--) T* output = output_mat.ptr<T>();
{
idx = t / ind_shape[j]; input_offset = 0;
offset_at_idx = (int)(t - idx * ind_shape[j]); indices_offset = 0;
ind_offset += offset_at_idx * ind_step[j]; indices_index = i;
inp_offset += offset_at_idx * step[j]; axis_offset = 0;
t = idx; for (int j = ndims - 1; j >= 0; j--) {
if (j == axis) tmp_index = indices_index / indices_mat_shape[j];
{ j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]);
offset_at_axis = offset_at_idx * step[j]; input_offset += j_index * input_mat_step[j];
indices_offset += j_index * indices_mat_step[j];
indices_index = tmp_index;
if (j == axis) {
axis_offset = j_index * input_mat_step[j];
} }
} }
ind_offset /= sizeof(T);
// get index and overwrite current indices // get index and overwrite current indices
const T* tmp_p_index = p_index + ind_offset; index = static_cast<int>(*(indices + indices_offset));
index = (int)(*tmp_p_index); index = (index + input_mat_shape[axis]) % input_mat_shape[axis];
CV_Assert(index < shape[axis] && index > -shape[axis]); CV_Assert(index < input_mat_shape[axis] && index >= 0);
input_offset = input_offset - axis_offset + index * input_mat_step[axis];
inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis]; updates += indices_offset;
inp_offset /= sizeof(T); output += input_offset;
*output = reduce_operation(*output, *updates);
const T* tmp_p_update = p_update + ind_offset;
T* tmp_p_out = p_out + inp_offset;
*tmp_p_out = rd(*tmp_p_out, *tmp_p_update);
} }
};
size_t total = indices_mat.total();
double nstripes = (size_t)total * ndims * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
} }
template<typename... Args> template<typename... Args>