Round output buffers for intSimdMatrix.

In order to allow intSimdMatrix implementations to 'overwrite' their outputs (i.e. write beyond the number of outputs actually requested), ensure that the output buffers are always padded up to the next multiple of the block size. This doesn't make any difference yet, but it enables optimisations further down the line, especially when the biasing is pulled into the SIMD code.
2025-01-22 09:53:03 +08:00 · 2020-05-27 12:18:23 +01:00 · 2020-05-27 12:18:23 +01:00 · aba1800f69
commit aba1800f69
parent 9dfdac51c6
3 changed files with 18 additions and 3 deletions
--- a/src/lstm/fullyconnected.cpp
+++ b/src/lstm/fullyconnected.cpp
@ -132,8 +132,11 @@ void FullyConnected::Forward(bool debug, const NetworkIO& input,
  temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec());
  GenericVector<NetworkScratch::FloatVec> curr_input;
  curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec());
+  int ro = no_;
+  if (IntSimdMatrix::intSimdMatrix)
+    ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
  for (int i = 0; i < kNumThreads; ++i) {
-    temp_lines[i].Init(no_, scratch);
+    temp_lines[i].Init(no_, ro, scratch);
    curr_input[i].Init(ni_, scratch);
  }
 #ifdef _OPENMP
--- a/src/lstm/lstm.cpp
+++ b/src/lstm/lstm.cpp
@ -264,7 +264,10 @@ void LSTM::Forward(bool debug, const NetworkIO& input,
  ResizeForward(input);
  // Temporary storage of forward computation for each gate.
  NetworkScratch::FloatVec temp_lines[WT_COUNT];
-  for (auto & temp_line : temp_lines) temp_line.Init(ns_, scratch);
+  int ro = ns_;
+  if (source_.int_mode() && IntSimdMatrix::intSimdMatrix)
+    ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
+  for (auto & temp_line : temp_lines) temp_line.Init(ns_, ro, scratch);
  // Single timestep buffers for the current/recurrent output and state.
  NetworkScratch::FloatVec curr_state, curr_output;
  curr_state.Init(ns_, scratch);
--- a/src/lstm/networkscratch.h
+++ b/src/lstm/networkscratch.h
@ -144,15 +144,24 @@ class NetworkScratch {
      if (scratch_space_ != nullptr) scratch_space_->vec_stack_.Return(vec_);
    }

-    void Init(int size, NetworkScratch* scratch) {
+    void Init(int size, int reserve, NetworkScratch* scratch) {
      if (scratch_space_ != nullptr && vec_ != nullptr)
        scratch_space_->vec_stack_.Return(vec_);
      scratch_space_ = scratch;
      vec_ = scratch_space_->vec_stack_.Borrow();
+      // Abuse vec_ here; first resize to 'reserve', which is larger
+      // than 'size' (i.e. it's size rounded up) then resize down again
+      // to the desired size. This assumes that the implementation does
+      // not shrink the storage on a resize.
+      vec_->resize_no_init(reserve);
      vec_->resize_no_init(size);
      data_ = &(*vec_)[0];
    }

+    void Init(int size, NetworkScratch *scratch) {
+      Init(size, size, scratch);
+    }
+
    // Use the cast operator instead of operator[] so the FloatVec can be used
    // as a double* argument to a function call.
    operator double*() const { return data_; }