Round output buffers for intSimdMatrix.

In order to allow intSimdMatrix implementations to 'overwrite'
their outputs (i.e. write past the requested number of outputs),
ensure that the output buffers are always padded up to the next
block size.

This doesn't make any difference yet, but it enables optimisations
further down the line, especially when the biasing is pulled into
the SIMD.
This commit is contained in:
Robin Watts 2020-05-27 12:18:23 +01:00
parent 9dfdac51c6
commit aba1800f69
3 changed files with 18 additions and 3 deletions

View File

@ -132,8 +132,11 @@ void FullyConnected::Forward(bool debug, const NetworkIO& input,
temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec());
GenericVector<NetworkScratch::FloatVec> curr_input;
curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec());
int ro = no_;
if (IntSimdMatrix::intSimdMatrix)
ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
for (int i = 0; i < kNumThreads; ++i) {
temp_lines[i].Init(no_, scratch);
temp_lines[i].Init(no_, ro, scratch);
curr_input[i].Init(ni_, scratch);
}
#ifdef _OPENMP

View File

@ -264,7 +264,10 @@ void LSTM::Forward(bool debug, const NetworkIO& input,
ResizeForward(input);
// Temporary storage of forward computation for each gate.
NetworkScratch::FloatVec temp_lines[WT_COUNT];
for (auto & temp_line : temp_lines) temp_line.Init(ns_, scratch);
int ro = ns_;
if (source_.int_mode() && IntSimdMatrix::intSimdMatrix)
ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
for (auto & temp_line : temp_lines) temp_line.Init(ns_, ro, scratch);
// Single timestep buffers for the current/recurrent output and state.
NetworkScratch::FloatVec curr_state, curr_output;
curr_state.Init(ns_, scratch);

View File

@ -144,15 +144,24 @@ class NetworkScratch {
if (scratch_space_ != nullptr) scratch_space_->vec_stack_.Return(vec_);
}
void Init(int size, NetworkScratch* scratch) {
// Prepares this FloatVec to hold 'size' elements, using a vector borrowed
// from scratch's vec_stack_.
//   size:    number of elements the vector is left holding.
//   reserve: number of elements the vector is first resized to, before it
//            is resized to 'size'.
//   scratch: NetworkScratch whose vec_stack_ supplies the vector; it is
//            remembered in scratch_space_.
void Init(int size, int reserve, NetworkScratch* scratch) {
// If a vector is already held, hand it back to the stack it came from
// before switching to the new scratch space.
if (scratch_space_ != nullptr && vec_ != nullptr)
scratch_space_->vec_stack_.Return(vec_);
scratch_space_ = scratch;
vec_ = scratch_space_->vec_stack_.Borrow();
// Abuse vec_ here; first resize to 'reserve', which is larger
// than 'size' (i.e. it's size rounded up) then resize down again
// to the desired size. This assumes that the implementation does
// not shrink the storage on a resize.
vec_->resize_no_init(reserve);
vec_->resize_no_init(size);
// Cache the address of the first element; operator double*() returns it.
// NOTE(review): this indexes element 0, so presumably size > 0 is
// expected by callers - confirm.
data_ = &(*vec_)[0];
}
// Convenience overload for callers that need no extra space beyond 'size':
// forwards to the three-argument Init with 'reserve' equal to 'size'.
void Init(int size, NetworkScratch *scratch) {
Init(size, size, scratch);
}
// Use the cast operator instead of operator[] so the FloatVec can be used
// as a double* argument to a function call.
// Returns data_, the element pointer cached by Init.
operator double*() const { return data_; }