Merge 51a3398a3c into de095fc074

2025-06-07 09:52:40 +08:00 · 2025-06-02 00:37:00 -06:00 · 2025-06-02 00:37:00 -06:00 · 7e1bc48521
commit 7e1bc48521
parent de095fc074 51a3398a3c
5 changed files with 1085 additions and 52 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -269,6 +269,7 @@ endif
 # Rules for src/ccstruct.

 noinst_HEADERS += src/ccstruct/blamer.h
+noinst_HEADERS += src/ccstruct/blob_bounds_calculator.h
 noinst_HEADERS += src/ccstruct/blobbox.h
 noinst_HEADERS += src/ccstruct/blobs.h
 noinst_HEADERS += src/ccstruct/blread.h
@ -312,6 +313,7 @@ noinst_HEADERS += src/ccstruct/params_training_featdef.h
 endif

 libtesseract_la_SOURCES += src/ccstruct/blamer.cpp
+libtesseract_la_SOURCES += src/ccstruct/blob_bounds_calculator.cpp
 libtesseract_la_SOURCES += src/ccstruct/blobbox.cpp
 libtesseract_la_SOURCES += src/ccstruct/blobs.cpp
 libtesseract_la_SOURCES += src/ccstruct/blread.cpp
@ -1176,6 +1178,7 @@ if !DISABLED_LEGACY_ENGINE
 check_PROGRAMS += bitvector_test
 endif # !DISABLED_LEGACY_ENGINE
 endif # ENABLE_TRAINING
+check_PROGRAMS += blob_bounds_calculator_test
 check_PROGRAMS += cleanapi_test
 check_PROGRAMS += colpartition_test
 if ENABLE_TRAINING
@ -1288,6 +1291,10 @@ bitvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
 bitvector_test_LDADD = $(TRAINING_LIBS)
 endif # !DISABLED_LEGACY_ENGINE

+blob_bounds_calculator_test_SOURCES = unittest/blob_bounds_calculator_test.cc
+blob_bounds_calculator_test_CPPFLAGS = $(unittest_CPPFLAGS)
+blob_bounds_calculator_test_LDADD = $(TESS_LIBS)
+
 cleanapi_test_SOURCES = unittest/cleanapi_test.cc
 cleanapi_test_CPPFLAGS = $(unittest_CPPFLAGS)
 cleanapi_test_LDADD = $(TESS_LIBS)
--- a/src/ccstruct/blob_bounds_calculator.cpp
+++ b/src/ccstruct/blob_bounds_calculator.cpp
@ -0,0 +1,491 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blob_bounds_calculator.cpp
+// Description: Module for calculation of blob bounds from LSTM data
+// Author:      Povilas Kanapickas
+//
+// (C) Copyright 2022, Povilas Kanapickas <povilas@radix.lt>
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "blob_bounds_calculator.h"
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+
+namespace tesseract {
+
+// Streams a boundary as
+// "CharBoundaryByBoxIndex{ <index>, <split_index> <split_count> }".
+std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d) {
+  out << "CharBoundaryByBoxIndex{ " << d.index;
+  out << ", " << d.split_index;
+  out << " " << d.split_count << " }";
+  return out;
+}
+
+// Streams every field of a placement decision, each prefixed by its name,
+// on a single line.
+std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d) {
+  out << "CharacterPlaceDecision{";
+  out << " prev_index: " << d.prev_index;
+  out << " has_boxes: " << d.has_boxes;
+  out << " begin: " << d.begin;
+  out << " end: " << d.end;
+  out << " prev_pos_diff: " << d.prev_pos_diff;
+  out << " cost: " << d.cost;
+  out << " }";
+  return out;
+}
+
+// Records a candidate placement unless it is too expensive or dominated.
+//
+// The candidate is dropped when its cost exceeds the current minimum cost by
+// more than max_cost_diff, or when the first stored decision with the same
+// `end` boundary is at least as cheap. A cheaper candidate overwrites that
+// stored decision; a candidate with a new `end` boundary is appended.
+// Whenever the minimum cost drops, stored decisions that no longer fit into
+// the [min_cost, min_cost + max_cost_diff] window are erased.
+void CharacterPlaceDecisions::add_place(unsigned prev_index, bool has_boxes,
+                                        CharBoundaryByBoxIndex begin,
+                                        CharBoundaryByBoxIndex end,
+                                        double prev_pos_diff,
+                                        double cost, double max_cost_diff) {
+  if (cost > min_cost + max_cost_diff) {
+    return;
+  }
+
+  const CharacterPlaceDecision candidate{prev_index, has_boxes, begin, end,
+                                         prev_pos_diff, cost};
+
+  // Only the first stored decision sharing this end boundary is examined.
+  auto same_end = std::find_if(decisions.begin(), decisions.end(),
+                               [&end](const CharacterPlaceDecision& stored) {
+    return stored.end == end;
+  });
+
+  if (same_end == decisions.end()) {
+    decisions.push_back(candidate);
+  } else if (cost < same_end->cost) {
+    *same_end = candidate;
+  } else {
+    // The stored decision is at least as good, keep it.
+    return;
+  }
+
+  if (cost < min_cost) {
+    min_cost = cost;
+
+    // The window moved down: drop everything that fell out of it.
+    const double cost_limit = min_cost + max_cost_diff;
+    auto first_removed = std::remove_if(
+        decisions.begin(), decisions.end(),
+        [cost_limit](const CharacterPlaceDecision& stored) {
+      return stored.cost > cost_limit;
+    });
+    decisions.erase(first_removed, decisions.end());
+  }
+}
+
+// Two results are equal when both X coordinates and both box indexes match.
+bool CharacterBoundaries::operator==(const CharacterBoundaries& other) const {
+  const bool same_begin = begin_x == other.begin_x &&
+                          begin_box_index == other.begin_box_index;
+  const bool same_end = end_x == other.end_x &&
+                        end_box_index == other.end_box_index;
+  return same_begin && same_end;
+}
+
+// Streams a result as
+// "CharacterBoundaries{<begin_x>, <begin_box_index>, <end_x>, <end_box_index>}".
+std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds) {
+  out << "CharacterBoundaries{" << bounds.begin_x;
+  out << ", " << bounds.begin_box_index;
+  out << ", " << bounds.end_x;
+  out << ", " << bounds.end_box_index << "}";
+  return out;
+}
+
+BoxBoundariesCalculator::BoxBoundariesCalculator(
+    const std::vector<BoxBoundaries>& bounds,
+    const BoxBoundariesCalculatorConfig& config) :
+  bounds_{bounds},
+  config_{config}
+{
+  if (!bounds_.empty()) {
+    double width_sum = 0;
+    for (const auto& b : bounds) {
+      width_sum += b.end - b.begin;
+    }
+    average_box_width_ = width_sum / static_cast<double>(bounds.size());
+  }
+}
+
+// Matches LSTM symbols to the segmenter boxes given at construction and
+// returns one CharacterBoundaries per symbol, in symbol order.
+//
+// For every symbol a set of candidate placements is built from the placements
+// kept for the previous symbol. Afterwards the cheapest complete path is
+// traced back, its split counts are normalized and it is converted to
+// coordinates.
+//
+// Returns an empty vector when `symbols` is empty.
+std::vector<CharacterBoundaries>
+  BoxBoundariesCalculator::calculate_bounds(const std::vector<BoxBoundaries>& symbols)
+{
+  if (symbols.empty()) {
+    // Nothing to place. Without this guard decisions.back() below would be
+    // called on an empty vector, which is undefined behaviour.
+    return {};
+  }
+
+  std::vector<CharacterPlaceDecisions> decisions;
+  decisions.resize(symbols.size());
+
+  // The initial state: nothing consumed yet, boundary before the first box.
+  CharacterPlaceDecisions init_decisions;
+  init_decisions.add_place(0, true, {0, 0, 0}, {0, 0, 0}, 0, 0,
+                           config_.max_character_cost_diff);
+
+  for (std::size_t is = 0; is != symbols.size(); ++is) {
+    const auto& symbol = symbols[is];
+    const auto& prev_decisions = is == 0 ? init_decisions : decisions[is - 1];
+    auto& next_decisions = decisions[is];
+
+    auto [symbol_min_box, symbol_max_box] = possible_boxes_for_symbol(symbol);
+
+    // NOTE(review): this indexing assumes the previous symbol ended up with at
+    // least one decision; nothing here enforces that - confirm.
+    unsigned prev_farthest_index = farthest_decision_index(prev_decisions);
+    const auto& prev_farthest_decision =
+        prev_decisions.decisions[prev_farthest_index];
+
+    if (symbol_min_box == symbol_max_box) {
+      // There are no boxes for the current symbol. Select the previous
+      // decision which went farthest and was at box boundary.
+      //
+      // We ignore everything that affects the cost for this symbol because the
+      // cost will be the same for all decision paths, thus will not affect
+      // which decision path is ultimately selected.
+      auto new_cost = prev_farthest_decision.cost +
+          config_.symbol_with_no_box_cost;
+
+      // We reset prev_pos_diff as we are effectively starting over.
+      next_decisions.add_place(prev_farthest_index, false, {{}, 0, 0},
+                               prev_farthest_decision.end,
+                               0, new_cost,
+                               config_.max_character_cost_diff);
+      continue;
+    }
+
+    if (prev_farthest_decision.end.index < symbol_min_box) {
+      // There are boxes that can't be attributed to any of the symbols because
+      // they are too far away. In this case we pick the previous decision path
+      // that went farthest and force the first box to be attributed to the
+      // symbol.
+      //
+      // We ignore everything that affects the cost for this symbol because the
+      // cost will be the same for all decision paths, thus will not affect
+      // which decision path is ultimately selected.
+
+      auto boxes_with_no_symbols =
+          symbol_min_box - prev_farthest_decision.end.index;
+
+      auto new_cost = prev_farthest_decision.cost +
+          config_.box_with_no_symbol_cost * boxes_with_no_symbols;
+
+      // We reset prev_pos_diff as we are effectively starting over.
+      try_decisions_from_prev_decision(next_decisions, prev_farthest_index,
+                                       {symbol_min_box, 0, 0},
+                                       0, new_cost,
+                                       symbol, symbol_max_box);
+      continue;
+    }
+
+    // Regular case: extend every kept placement of the previous symbol.
+    for (std::size_t i_d = 0; i_d < prev_decisions.decisions.size(); ++i_d) {
+      const auto& prev_decision = prev_decisions.decisions[i_d];
+      try_decisions_from_prev_decision(next_decisions, i_d,
+                                       prev_decision.end,
+                                       prev_decision.prev_pos_diff,
+                                       prev_decision.cost,
+                                       symbol, symbol_max_box);
+    }
+  }
+
+  add_costs_for_remaining_boxes(decisions.back());
+  auto best_decision_path = pick_best_decision_path(decisions);
+  fix_decisions_split_count(best_decision_path);
+  return decisions_to_results(symbols, best_decision_path);
+}
+
+// Enumerates the end boundaries a symbol may take when it starts at
+// start_bound and submits each of them, with its cost, to next_decisions.
+//
+// When the start lies inside a box, that box may be split once more or its
+// remainder may be taken. After that, every box boundary up to
+// symbol_max_box is tried both as a full-box end and as a half-box split.
+void BoxBoundariesCalculator::try_decisions_from_prev_decision(
+    CharacterPlaceDecisions& next_decisions,
+    unsigned prev_decision_index,
+    CharBoundaryByBoxIndex start_bound,
+    double prev_decision_pos_diff,
+    double prev_decision_cost,
+    const BoxBoundaries& symbol, unsigned symbol_max_box)
+{
+  // All attempts share everything except the end boundary.
+  auto try_end = [&](CharBoundaryByBoxIndex end_bound) {
+    try_decision_from_prev_decision(next_decisions, prev_decision_index,
+                                    start_bound, end_bound,
+                                    prev_decision_pos_diff, prev_decision_cost,
+                                    symbol);
+  };
+
+  const bool starts_inside_box = start_bound.split_index > 0;
+  if (starts_inside_box) {
+    // split the start box one more time
+    try_end({start_bound.index,
+             start_bound.split_index + 1,
+             start_bound.split_count + 1});
+    // take whatever is left of the start box
+    try_end({start_bound.index, 0, 0});
+  }
+
+  unsigned end_box = start_bound.index + 1;
+  while (end_box <= symbol_max_box) {
+    // one or more full boxes
+    try_end({end_box, 0, 0});
+    // zero or more full boxes followed by the first half of a split box
+    try_end({end_box, 1, 2});
+    ++end_box;
+  }
+}
+
+// Prices a single placement (start_bound .. end_bound) for `symbol` on top of
+// the cost of the previous decision and hands it to next_decisions.
+void BoxBoundariesCalculator::try_decision_from_prev_decision(
+    CharacterPlaceDecisions& next_decisions,
+    unsigned prev_decision_index,
+    CharBoundaryByBoxIndex start_bound, CharBoundaryByBoxIndex end_bound,
+    double prev_decision_pos_diff,
+    double prev_decision_cost,
+    const BoxBoundaries& symbol)
+{
+  // Rules used for the additional cost:
+  //
+  //  - The boxes assigned to the symbol are treated as one merged box whose
+  //    center is the midpoint between the start and end boundaries. No
+  //    weighting is applied: the pieces presumably belong to one symbol that
+  //    was broken up by poor input or by a segmenter error.
+  //
+  //  - For a split box the boundary position uses the split factor known at
+  //    this point. Later decisions may split the box further; earlier costs
+  //    are not revisited when that happens.
+  double total_cost = prev_decision_cost;
+
+  // Ending inside a box means that box is being split.
+  if (end_bound.split_index != 0) {
+    total_cost += config_.split_cost;
+  }
+
+  // Number of box joins inside the assigned range. A start on a box edge
+  // contributes one box fewer than the raw index difference suggests.
+  unsigned merges = end_bound.index - start_bound.index;
+  if (start_bound.split_index == 0) {
+    --merges;
+  }
+  total_cost += config_.merge_cost * merges;
+
+  const double assigned_center =
+      (get_box_pos_begin(start_bound) + get_box_pos_end(end_bound)) / 2;
+  const double drift = symbol.middle() - assigned_center;
+
+  // Only drift that grows beyond the previous symbol's drift, in the same
+  // direction, is charged.
+  double charged_drift = 0;
+  if (drift < 0 && drift < prev_decision_pos_diff) {
+    charged_drift = prev_decision_pos_diff < 0
+        ? prev_decision_pos_diff - drift
+        : -drift;
+  } else if (drift > 0 && drift > prev_decision_pos_diff) {
+    charged_drift = prev_decision_pos_diff > 0
+        ? drift - prev_decision_pos_diff
+        : drift;
+  }
+
+  total_cost += config_.pos_diff_cost * charged_drift / average_box_width_;
+
+  next_decisions.add_place(prev_decision_index, true, start_bound, end_bound,
+                           drift, total_cost, config_.max_character_cost_diff);
+}
+
+
+// X coordinate at which a character starting at `bound` begins: the left edge
+// of box `bound.index` for an unsplit boundary, otherwise the split position
+// inside the preceding box.
+double BoxBoundariesCalculator::get_box_pos_begin(CharBoundaryByBoxIndex bound)
+{
+  const bool on_box_edge = bound.split_index == 0;
+  if (!on_box_edge) {
+    assert(bound.index > 0);
+    const BoxBoundaries& split_box = bounds_[bound.index - 1];
+    return get_box_split_pos(split_box, bound.split_index, bound.split_count);
+  }
+  return bounds_[bound.index].begin;
+}
+
+// X coordinate at which a character ending at `bound` ends: the right edge of
+// the preceding box for an unsplit boundary, otherwise the split position
+// inside that box.
+double BoxBoundariesCalculator::get_box_pos_end(CharBoundaryByBoxIndex bound)
+{
+  assert(bound.index > 0);
+
+  const BoxBoundaries& preceding_box = bounds_[bound.index - 1];
+  if (bound.split_index != 0) {
+    return get_box_split_pos(preceding_box,
+                             bound.split_index, bound.split_count);
+  }
+  return preceding_box.end;
+}
+
+
+// Returns the position, within decisions.decisions, of the decision that ends
+// on a box edge (split_index == 0) with the largest end box index. Ties on the
+// end index are resolved in favour of the lower cost. Returns 0 when no
+// decision ends on a box edge.
+int BoxBoundariesCalculator::farthest_decision_index(
+    const CharacterPlaceDecisions& decisions)
+{
+    unsigned best_decision = 0;
+    unsigned max_box_index = 0;
+    double best_decision_cost = std::numeric_limits<double>::infinity();
+
+    for (std::size_t i = 0; i < decisions.decisions.size(); ++i) {
+      const auto& decision = decisions.decisions[i];
+
+      if (decision.end.split_index != 0) {
+        // Decisions ending inside a box are not eligible.
+        continue;
+      }
+
+      // "Farthest" means a strictly larger end index. The previous `<`
+      // comparison could never be true against the initial value 0, so only
+      // decisions ending at index 0 were ever selected.
+      const bool goes_farther = decision.end.index > max_box_index;
+      const bool cheaper_tie = decision.end.index == max_box_index &&
+                               decision.cost < best_decision_cost;
+      if (goes_farther || cheaper_tie) {
+        max_box_index = decision.end.index;
+        best_decision_cost = decision.cost;
+        best_decision = i;
+      }
+    }
+    return best_decision;
+}
+
+// Returns the half-open range [first, last) of box indexes whose centers lie
+// within max_pos_diff average box widths of the symbol's extent. The search
+// uses std::partition_point, i.e. it relies on the boxes being ordered by
+// center. Returns {0, 0} when every box center is left of the allowed range.
+std::pair<unsigned, unsigned>
+  BoxBoundariesCalculator::possible_boxes_for_symbol(const BoxBoundaries& symbol)
+{
+    const auto reach = config_.max_pos_diff * average_box_width_;
+    const auto lowest_center = symbol.begin - reach;
+    const auto highest_center = symbol.end + reach;
+
+    const auto first_candidate = std::partition_point(
+        bounds_.begin(), bounds_.end(),
+        [lowest_center](const BoxBoundaries& box) {
+      return box.middle() < lowest_center;
+    });
+
+    if (first_candidate == bounds_.end()) {
+        return { 0, 0 };
+    }
+
+    const auto past_last_candidate = std::partition_point(
+        first_candidate, bounds_.end(),
+        [highest_center](const BoxBoundaries& box) {
+      return box.middle() < highest_center;
+    });
+
+    return { std::distance(bounds_.begin(), first_candidate),
+             std::distance(bounds_.begin(), past_last_candidate) };
+}
+
+// Charges every final decision that ends on a box edge for the boxes that
+// follow its end boundary, i.e. the boxes left without any symbol. Decisions
+// ending inside a box are left untouched.
+void BoxBoundariesCalculator::add_costs_for_remaining_boxes(
+    CharacterPlaceDecisions& decisions) {
+
+  for (auto& decision : decisions.decisions) {
+    if (decision.end.split_index != 0) {
+      // We don't care about decisions that don't end on a box boundary.
+      continue;
+    }
+
+    // end.index == 0 is a legitimate state: a symbol without boxes inherits
+    // the end boundary of the previous decision, which for the first symbol
+    // is the initial {0, 0, 0}. Only an index past the box list would make
+    // the unsigned subtraction below wrap around.
+    assert(decision.end.index <= bounds_.size());
+
+    const auto unused_boxes = bounds_.size() - decision.end.index;
+    decision.cost += unused_boxes * config_.box_with_no_symbol_cost;
+  }
+}
+
+// Reconstructs the chosen placement for every symbol: starts at the best
+// decision of the last symbol and follows prev_index links back to the first
+// symbol. The result holds one decision per symbol, in symbol order.
+std::vector<CharacterPlaceDecision>
+  BoxBoundariesCalculator::pick_best_decision_path(
+    std::vector<CharacterPlaceDecisions>& decisions) {
+
+  std::vector<CharacterPlaceDecision> path(decisions.size());
+
+  unsigned chosen = get_best_end_decision(decisions.back());
+  for (std::size_t remaining = decisions.size(); remaining > 0; --remaining) {
+    const std::size_t level = remaining - 1;
+    const CharacterPlaceDecision& picked = decisions[level].decisions[chosen];
+
+    path[level] = picked;
+    // The link points into the decision list of the preceding symbol.
+    chosen = picked.prev_index;
+  }
+
+  return path;
+}
+
+// Normalizes split_count along a decision path so that all boundaries inside
+// the same box share one denominator.
+//
+// While the path is built, successive splits of one box are recorded as
+// 1/2, 2/3, 3/4, ... The last of them carries the real number of parts, so
+// every split boundary of that box must use that split_count (giving
+// 1/4, 2/4, 3/4 for a box split into four parts).
+void BoxBoundariesCalculator::fix_decisions_split_count(
+    std::vector<CharacterPlaceDecision>& decisions) {
+  unsigned last_box_index = std::numeric_limits<unsigned>::max();
+  unsigned last_box_split_count = 0;
+
+  // Boundaries are visited from the end of the path towards its start, so for
+  // a given box index the first split boundary met is the one with the
+  // largest split_count.
+  auto adjust_index = [&](CharBoundaryByBoxIndex& index) {
+    if (index.index != last_box_index) {
+      // First boundary seen for this box index.
+      last_box_index = index.index;
+      last_box_split_count = index.split_count;
+      return;
+    }
+    if (index.split_count == 0) {
+      // Boundary on a box edge, nothing to normalize.
+      return;
+    }
+    if (last_box_split_count == 0) {
+      // First split boundary of this box: it defines the denominator.
+      // Previously this value was overwritten by every later boundary, which
+      // left all split counts unchanged.
+      last_box_split_count = index.split_count;
+    }
+    index.split_count = last_box_split_count;
+  };
+
+  for (auto it = decisions.rbegin(); it != decisions.rend(); ++it) {
+    if (it->has_boxes) {
+      adjust_index(it->end);
+      adjust_index(it->begin);
+    }
+  }
+}
+
+// Converts the chosen decision of every symbol into X coordinates and a box
+// index range. A symbol without boxes keeps its own begin/end coordinates and
+// gets the empty box range [0, 0).
+std::vector<CharacterBoundaries> BoxBoundariesCalculator::decisions_to_results(
+    const std::vector<BoxBoundaries>& symbols,
+    const std::vector<CharacterPlaceDecision>& decisions)
+{
+  std::vector<CharacterBoundaries> results(symbols.size());
+
+  for (std::size_t pos = 0; pos < decisions.size(); ++pos) {
+    const CharacterPlaceDecision& decision = decisions[pos];
+
+    if (!decision.has_boxes) {
+      const BoxBoundaries& symbol = symbols[pos];
+      results[pos] = CharacterBoundaries{symbol.begin, 0, symbol.end, 0};
+      continue;
+    }
+
+    // Results refer to boxes that are at least partially assigned to the
+    // character. A begin boundary inside a box is stored with the index of
+    // the following box, so step back to the box that is actually touched.
+    auto first_box = decision.begin.index;
+    if (decision.begin.split_count > 0) {
+      --first_box;
+    }
+
+    const int begin_x = static_cast<int>(get_box_pos_begin(decision.begin));
+    const int end_x = static_cast<int>(get_box_pos_end(decision.end));
+    results[pos] = CharacterBoundaries{begin_x, first_box,
+                                       end_x, decision.end.index};
+  }
+
+  return results;
+}
+
+// Returns the position of the cheapest decision that ends on a box edge
+// (split_index == 0); the earliest one wins on equal cost. Returns 0 when no
+// decision qualifies. The decision list must not be empty.
+int BoxBoundariesCalculator::get_best_end_decision(
+    const CharacterPlaceDecisions& decisions) {
+  assert(!decisions.decisions.empty());
+
+  unsigned winner = 0;
+  double lowest_cost = std::numeric_limits<double>::infinity();
+
+  unsigned position = 0;
+  for (const CharacterPlaceDecision& candidate : decisions.decisions) {
+    const unsigned current = position++;
+    const bool ends_on_box_edge = candidate.end.split_index == 0;
+    if (ends_on_box_edge && candidate.cost < lowest_cost) {
+      winner = current;
+      lowest_cost = candidate.cost;
+    }
+  }
+
+  return winner;
+}
+
+} // namespace tesseract
--- a/src/ccstruct/blob_bounds_calculator.h
+++ b/src/ccstruct/blob_bounds_calculator.h
@ -0,0 +1,278 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blob_bounds_calculator.h
+// Description: Module for calculation of blob bounds from LSTM data
+// Author:      Povilas Kanapickas
+//
+// (C) Copyright 2022, Povilas Kanapickas <povilas@radix.lt>
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H
+#define TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H
+
+#include <iosfwd>
+#include <limits>
+#include <optional>
+#include <vector>
+
+namespace tesseract {
+
+/* This file contains an implementation of an algorithm for improving character
+   positions when using LSTM models. LSTM model output produces only approximate
+   character positions without boundary data. Matching it to the blobs that
+   comprise the characters is a non-trivial task, because the character
+   positions in the LSTM output have drift that is large enough for simple
+   algorithms such as "pick nearest blobs" to produce large amounts of errors.
+
+   It can be noticed that while LSTM model output produces only approximate
+   character positions, the regular segmenter is pretty good. Most of the blob
+   boundaries correspond to boundaries of characters and most significant errors
+   are occasional blobs that correspond to multiple characters or multiple blobs
+   that correspond to a single character.
+
+   Thus the basic idea of the algorithm is to treat the output of the regular
+   segmenter as a template to which LSTM model output is matched. The selection
+   of best match is done by assigning each unwanted property a cost and
+   then minimizing the total cost of the solution. The algorithm uses the
+   following costs:
+
+    - cost for merging multiple blobs to represent a character
+    - cost for splitting a blob to represent multiple characters
+    - cost for difference between the positions of the blobs and characters
+      that they are matched to.
+
+   The cost of difference between positions is computed not by simply
+   accumulating the sum of all position differences, but by only taking into
+   account additional difference of each character compared to previous
+   character. This way the algorithm does not attempt to "optimize" out of
+   place characters by adding unneeded blob merges and splits.
+
+   The optimization problem is solved by dynamic programming techniques by
+   noticing that assigning specific blobs to a character leaves us with a
+   slightly smaller problem.
+
+   The approach is to place the first character in all potential positions
+   and record the outcomes. Then for each of these outcomes attempts are made
+   to place the second character at all potential positions and so on.
+   Whenever there are multiple decision paths to arrive to a situation when the
+   end of a specific character is at the same position, the path with the
+   lowest cost is picked and others are ignored.
+*/
+
+// A character boundary expressed relative to the list of input boxes: the
+// index of a box plus, optionally, a split position inside the box before it.
+struct CharBoundaryByBoxIndex {
+  // Index of the box that follows the boundary.
+  unsigned index = 0;
+
+  // Where the boundary lies. With split_count == 0 the boundary sits just
+  // before box `index`. Otherwise it is located at the fraction
+  // (split_index / split_count) of the preceding box.
+  unsigned split_index = 0;
+  unsigned split_count = 0;
+
+  bool operator==(const CharBoundaryByBoxIndex& other) const {
+    if (index != other.index) {
+      return false;
+    }
+    if (split_index != other.split_index) {
+      return false;
+    }
+    return split_count == other.split_count;
+  }
+
+  bool operator!=(const CharBoundaryByBoxIndex& other) const {
+    return !operator==(other);
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, const CharBoundaryByBoxIndex& d);
+
+
+// Represents a placement of a specific character at specific location.
+struct CharacterPlaceDecision {
+  // Index of the placement decision of the previous character. Initialized
+  // like the other members so that a default-constructed decision never
+  // carries an indeterminate value.
+  unsigned prev_index = 0;
+  // Whether the character had any boxes assigned to it. If not, then the
+  // data stored in `begin` is not defined.
+  bool has_boxes = false;
+  // Placement of the start of a character in the input box list.
+  CharBoundaryByBoxIndex begin;
+  // Placement of the end of a character in the input box list.
+  CharBoundaryByBoxIndex end;
+  // The difference of positions between the center of the previous character
+  // and the center of the assigned boxes
+  double prev_pos_diff = 0;
+  // The cost incurred so far
+  double cost = 0;
+};
+
+std::ostream& operator<<(std::ostream& out, const CharacterPlaceDecision& d);
+
+
+// The candidate placements kept for one character.
+struct CharacterPlaceDecisions {
+  // Candidates that survived pruning.
+  std::vector<CharacterPlaceDecision> decisions;
+
+  // Lowest cost among the candidates submitted so far; infinity until the
+  // first candidate arrives.
+  double min_cost = std::numeric_limits<double>::infinity();
+
+  // Submits a candidate placement. The candidate is kept only if its cost is
+  // within max_cost_diff of the minimum and it is cheaper than the stored
+  // candidate with the same `end` boundary, if there is one.
+  void add_place(unsigned prev_index, bool has_boxes,
+                 CharBoundaryByBoxIndex begin, CharBoundaryByBoxIndex end,
+                 double prev_pos_diff, double cost, double max_cost_diff);
+};
+
+// Horizontal extent of a box: its begin and end X coordinates.
+struct BoxBoundaries {
+  int begin = 0;
+  int end = 0;
+
+  // Center of the box; computed in double so that the sum cannot overflow.
+  double middle() const {
+    return (static_cast<double>(begin) + end) / 2;
+  }
+};
+
+
+// Final boundaries of one character. Besides the exact X positions it names
+// the input blobs the character was matched to, so that callers can derive
+// the boundaries in the Y axis from those blobs.
+struct CharacterBoundaries {
+  // X coordinate where the character begins.
+  int begin_x = 0;
+
+  // Index of the first box of the character (inclusive).
+  unsigned begin_box_index = 0;
+
+  // X coordinate where the character ends.
+  int end_x = 0;
+
+  // Index one past the last box of the character (exclusive). When the box
+  // data is invalid, begin_box_index == end_box_index.
+  unsigned end_box_index = 0;
+
+  bool operator==(const CharacterBoundaries& other) const;
+};
+
+std::ostream& operator<<(std::ostream& out, const CharacterBoundaries& bounds);
+
+
+// Tunable costs and limits of the matching algorithm.
+struct BoxBoundariesCalculatorConfig
+{
+  // Charged for every merge of two input boxes into one character.
+  double merge_cost = 2.0;
+
+  // Charged for every split of an input box between characters.
+  double split_cost = 2.0;
+
+  // Charged for every box that is not attributed to any symbol.
+  double box_with_no_symbol_cost = 2.2;
+
+  // Charged for every symbol that has no boxes.
+  double symbol_with_no_box_cost = 2.2;
+
+  // Multiplier for the distance between the center of the symbol and the
+  // center of the boxes assigned to it. The distance is charged only when a
+  // subsequent character "moves" further in the wrong direction, and it is
+  // measured relative to the average width of the input boxes.
+  double pos_diff_cost = 1.0;
+
+  // How far from a symbol a box may be to still be considered for it,
+  // relative to the average width of the input boxes.
+  double max_pos_diff = 2.0;
+
+  // Maximum allowed difference between the lowest and the highest cost among
+  // the placements kept for one character.
+  double max_character_cost_diff = 5.0;
+};
+
+// Matches LSTM symbol positions to segmenter boxes. The algorithm is
+// described in the comment at the top of this file.
+class BoxBoundariesCalculator {
+public:
+  // Creates a calculator for the blob boundaries produced by the regular
+  // segmenter.
+  BoxBoundariesCalculator(const std::vector<BoxBoundaries>& bounds,
+                          const BoxBoundariesCalculatorConfig& config);
+
+  // Computes improved character positions from LSTM model output. Only the
+  // center coordinate of each symbol takes part in positioning; the begin and
+  // end coordinates serve as a fallback for symbols that match no input blob.
+  std::vector<CharacterBoundaries>
+    calculate_bounds(const std::vector<BoxBoundaries>& symbols);
+
+private:
+
+  // Tries every combination of box boundaries between start_bound and
+  // symbol_max_box, prices each one and adds it to next_decisions. There are
+  // roughly (symbol_max_box - start_bound.index) * 2 combinations: two per
+  // available box, because each box may also be split with the subsequent
+  // symbol.
+  void try_decisions_from_prev_decision(CharacterPlaceDecisions& next_decisions,
+                                        unsigned prev_decision_index,
+                                        CharBoundaryByBoxIndex start_bound,
+                                        double prev_decision_pos_diff,
+                                        double prev_decision_cost,
+                                        const BoxBoundaries& symbol,
+                                        unsigned symbol_max_box);
+
+  // Prices one placement from start_bound to end_bound and adds it to
+  // next_decisions.
+  void try_decision_from_prev_decision(CharacterPlaceDecisions& next_decisions,
+                                       unsigned prev_decision_index,
+                                       CharBoundaryByBoxIndex start_bound,
+                                       CharBoundaryByBoxIndex end_bound,
+                                       double prev_decision_pos_diff,
+                                       double prev_decision_cost,
+                                       const BoxBoundaries& symbol);
+
+  // X coordinate of a boundary when it is used as the begin / the end of a
+  // character.
+  double get_box_pos_begin(CharBoundaryByBoxIndex bound);
+  double get_box_pos_end(CharBoundaryByBoxIndex bound);
+
+  // X coordinate of the split at fraction split_index / split_count of box b.
+  double get_box_split_pos(const BoxBoundaries& b, unsigned split_index,
+                           unsigned split_count)
+  {
+    const double width = b.end - b.begin;
+    return b.begin + width * static_cast<double>(split_index) / split_count;
+  }
+
+  // Position, within decisions, of the decision that ends on a box boundary
+  // and reached the farthest box.
+  static int farthest_decision_index(const CharacterPlaceDecisions& decisions);
+
+  // Half-open range of box indexes that may be assigned to the symbol.
+  std::pair<unsigned, unsigned>
+    possible_boxes_for_symbol(const BoxBoundaries& symbol);
+
+
+  // Walks the decisions and adds the cost of every box that has not been
+  // assigned to a symbol.
+  void add_costs_for_remaining_boxes(CharacterPlaceDecisions& decisions);
+
+  // Walks the final decisions and extracts the complete path that leads to
+  // the best placement.
+  std::vector<CharacterPlaceDecision> pick_best_decision_path(
+    std::vector<CharacterPlaceDecisions>& decisions);
+
+  // Split sizes are not updated while decisions are constructed, even when a
+  // blob is split more than once. Splitting a blob into 4 parts therefore
+  // records splits at 0.5, 0.66 and 0.75 of the blob, whereas the correct
+  // splits are at 0.25, 0.5 and 0.75. This is assumed not to matter for the
+  // costs, but the character positions must be exact, so the split counts
+  // are corrected here.
+  void fix_decisions_split_count(std::vector<CharacterPlaceDecision>& decisions);
+
+  // Turns the chosen decisions into coordinates and box index ranges.
+  std::vector<CharacterBoundaries>
+      decisions_to_results(const std::vector<BoxBoundaries>& symbols,
+                           const std::vector<CharacterPlaceDecision>& decisions);
+
+  // Finds the best of the final decisions: the one with the minimum cost
+  // among the decisions that end at a proper box boundary.
+  static int get_best_end_decision(const CharacterPlaceDecisions& decisions);
+
+  // Input boxes from the segmenter (a copy).
+  std::vector<BoxBoundaries> bounds_;
+  // Costs and limits in effect.
+  BoxBoundariesCalculatorConfig config_;
+  // Mean width of the input boxes; stays 0 when there are none.
+  double average_box_width_ = 0;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_BLOB_BOUNDS_CALCULATOR_H
--- a/src/ccstruct/pageres.cpp
+++ b/src/ccstruct/pageres.cpp
@ -24,6 +24,7 @@
 #include "pageres.h"

 #include "blamer.h"   // for BlamerBundle
+#include "blob_bounds_calculator.h" // for BoxBoundariesCalculator
 #include "blobs.h"    // for TWERD, TBLOB
 #include "boxword.h"  // for BoxWord
 #include "errcode.h"  // for ASSERT_HOST
@ -1273,36 +1274,6 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
  return new_res;
 }

-// Helper computes the boundaries between blobs in the word. The blob bounds
-// are likely very poor, if they come from LSTM, where it only outputs the
-// character at one pixel within it, so we find the midpoints between them.
-static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
-                            C_BLOB_LIST *next_word_blobs,
-                            std::vector<int> *blob_ends) {
-  C_BLOB_IT blob_it(word.word->cblob_list());
-  for (int length : word.best_state) {
-    // Get the bounding box of the fake blobs
-    TBOX blob_box = blob_it.data()->bounding_box();
-    blob_it.forward();
-    for (int b = 1; b < length; ++b) {
-      blob_box += blob_it.data()->bounding_box();
-      blob_it.forward();
-    }
-    // This blob_box is crap, so for now we are only looking for the
-    // boundaries between them.
-    int blob_end = INT32_MAX;
-    if (!blob_it.at_first() || next_word_blobs != nullptr) {
-      if (blob_it.at_first()) {
-        blob_it.set_to_list(next_word_blobs);
-      }
-      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
-    }
-    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
-    blob_ends->push_back(blob_end);
-  }
-  blob_ends->back() = clip_box.right();
-}
-
 // Helper computes the bounds of a word by restricting it to existing words
 // that significantly overlap.
 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
@ -1349,11 +1320,45 @@ static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
  return clipped_box;
 }

-// Helper moves the blob from src to dest. If it isn't contained by clip_box,
-// the blob is replaced by a fake that is contained.
-static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
-                            const TBOX &clip_box) {
-  C_BLOB *src_blob = src_it->extract();
+// Builds the symbol input for BoxBoundariesCalculator: one BoxBoundaries
+// (left, right) per entry of best_state, over all of the given words in
+// order. Each best_state entry is the number of consecutive blobs from the
+// word's cblob_list that are merged into one box; only the horizontal extent
+// of the merged box is kept.
+static std::vector<BoxBoundaries> ComputeFakeWordBlobXBounds(
+    const PointerVector<WERD_RES> &words) {
+  std::vector<BoxBoundaries> x_bounds;
+  for (size_t word_index = 0; word_index < words.size(); ++word_index) {
+    WERD_RES *current_word = words[word_index];
+    C_BLOB_IT fake_it(current_word->word->cblob_list());
+    for (int blob_count : current_word->best_state) {
+      // Start from the first blob of the group, then fold in the rest.
+      TBOX merged_box = fake_it.data()->bounding_box();
+      fake_it.forward();
+      for (int merged = 1; merged < blob_count; ++merged) {
+        merged_box += fake_it.data()->bounding_box();
+        fake_it.forward();
+      }
+      x_bounds.push_back({merged_box.left(), merged_box.right()});
+    }
+  }
+  return x_bounds;
+}
+
+// Builds the box input for BoxBoundariesCalculator: for every TBOX in
+// `boxes`, in the same order, keeps only its left and right x coordinates.
+static std::vector<BoxBoundaries> ComputeBlobXBoundsFromTBOX(
+    const std::vector<TBOX> &boxes) {
+  std::vector<BoxBoundaries> x_bounds;
+  x_bounds.reserve(boxes.size());
+  for (size_t i = 0; i < boxes.size(); ++i) {
+    const TBOX &current = boxes[i];
+    x_bounds.push_back({current.left(), current.right()});
+  }
+  return x_bounds;
+}
+
+// Helper moves the src_blob to dest. If it isn't contained by clip_box,
+// the blob is replaced by a fake that is contained. The helper takes ownership
+// of the blob.
+static TBOX ClipAndAddBlob(C_BLOB *src_blob, C_BLOB_IT *dest_it,
+                           const TBOX &clip_box) {
  TBOX box = src_blob->bounding_box();
  if (!clip_box.contains(box)) {
    int left =
@ -1372,6 +1377,13 @@ static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
  return box;
 }

+// Returns a copy of `box` clipped horizontally to [left, right]; bottom and
+// top are carried over unchanged. The left edge is limited to at most
+// right - 1 and the right edge to at least left + 1.
+static TBOX ClipBoxX(const TBOX &box, int left, int right) {
+  return TBOX(ClipToRange<int>(box.left(), left, right - 1),
+              box.bottom(),
+              ClipToRange<int>(box.right(), left + 1, right),
+              box.top());
+}
+
 // Replaces the current WERD/WERD_RES with the given words. The given words
 // contain fake blobs that indicate the position of the characters. These are
 // replaced with real blobs from the current word as much as possible.
@ -1416,21 +1428,31 @@ void PAGE_RES_IT::ReplaceCurrentWord(
    }
  }
  ASSERT_HOST(!wr_it.cycled_list());
-  // Since we only have an estimate of the bounds between blobs, use the blob
-  // x-middle as the determiner of where to put the blobs
+
+  std::vector<TBOX> blob_boxes;
+
  C_BLOB_IT src_b_it(input_word->word->cblob_list());
  src_b_it.sort(&C_BLOB::SortByXMiddle);
+  for (src_b_it.mark_cycle_pt(); !src_b_it.cycled_list(); src_b_it.forward()) {
+    blob_boxes.push_back(src_b_it.data()->bounding_box());
+  }
+  src_b_it.move_to_first();
+
  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
  rej_b_it.sort(&C_BLOB::SortByXMiddle);
+
+  auto fake_blob_bounds = ComputeFakeWordBlobXBounds(*words);
+  BoxBoundariesCalculator calculator{ComputeBlobXBoundsFromTBOX(blob_boxes), {}};
+  auto char_bounds = calculator.calculate_bounds(fake_blob_bounds);
+  size_t char_bounds_i = 0;
+  size_t box_bounds_i = 0;
+  TBOX last_blob_box;
+
  TBOX clip_box;
  for (size_t w = 0; w < words->size(); ++w) {
    WERD_RES *word_w = (*words)[w];
    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
-    // Compute blob boundaries.
-    std::vector<int> blob_ends;
-    C_BLOB_LIST *next_word_blobs =
-        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
-    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
+
    // Remove the fake blobs on the current word, but keep safe for back-up if
    // no blob can be found.
    C_BLOB_LIST fake_blobs;
@ -1441,26 +1463,64 @@ void PAGE_RES_IT::ReplaceCurrentWord(
    C_BLOB_IT dest_it(word_w->word->cblob_list());
    // Build the box word as we move the blobs.
    auto *box_word = new tesseract::BoxWord;
-    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
-      int end_x = blob_ends[i];
+
+    for (size_t i = 0; i < word_w->best_state.size(); ++i) {
+      const auto& char_bound = char_bounds[char_bounds_i++];
+
      TBOX blob_box;
-      // Add the blobs up to end_x.
-      while (!src_b_it.empty() &&
-             src_b_it.data()->bounding_box().x_middle() < end_x) {
-        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
-        src_b_it.forward();
+      if (char_bound.begin_box_index != char_bound.end_box_index) {
+        // The box indices in char_bounds never decrease from one character
+        // to the next, thus we can iterate src_b_it in the same order.
+        while (box_bounds_i < char_bound.begin_box_index) {
+          box_bounds_i++;
+          src_b_it.forward();
+        }
+
+        if (box_bounds_i > char_bound.begin_box_index) {
+          // The blob was split across multiple characters and has already
+          // been extracted for a previous character. We have the bounds
+          // of the blob and can create a fake blob out of it.
+          TBOX fake_box = ClipBoxX(last_blob_box,
+                                   char_bound.begin_x, char_bound.end_x);
+          blob_box += ClipAndAddBlob(C_BLOB::FakeBlob(fake_box),
+                                     &dest_it, clip_box);
+        }
+
+        // Add all blobs that have not yet been assigned to any of the
+        // characters.
+        while (box_bounds_i < char_bound.end_box_index) {
+          auto* src_blob = src_b_it.extract();
+          last_blob_box = src_blob->bounding_box();
+          TBOX inserted_box = ClipAndAddBlob(src_blob, &dest_it, clip_box);
+
+          box_bounds_i++;
+          src_b_it.forward();
+
+          // Note that the blob may be split across multiple characters in
+          // which case we want to clip the box to the part that was "assigned"
+          // to the character.
+          blob_box += ClipBoxX(inserted_box,
+                               char_bound.begin_x, char_bound.end_x);
+        }
      }
+
+      // It's not clear where rejected blobs should be added because by
+      // definition we don't have enough information about them. So we just
+      // add them to whatever character follows.
      while (!rej_b_it.empty() &&
-             rej_b_it.data()->bounding_box().x_middle() < end_x) {
-        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
+             rej_b_it.data()->bounding_box().x_middle() < char_bound.end_x) {
+        blob_box += ClipAndAddBlob(rej_b_it.extract(), &dest_it, clip_box);
        rej_b_it.forward();
      }
+
      if (blob_box.null_box()) {
        // Use the original box as a back-up.
-        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
+        blob_box = ClipAndAddBlob(fake_b_it.extract(), &dest_it, clip_box);
      }
      box_word->InsertBox(i, blob_box);
+      fake_b_it.forward();
    }
+
    delete word_w->box_word;
    word_w->box_word = box_word;
    if (!input_word->combination) {
--- a/unittest/blob_bounds_calculator_test.cc
+++ b/unittest/blob_bounds_calculator_test.cc
@ -0,0 +1,197 @@
+// (C) Copyright 2022, Povilas Kanapickas <povilas@radix.lt>.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "blob_bounds_calculator.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+namespace {
+
+// Returns the calculator configuration shared by the tests in this file:
+// a cost of 1 for merging, splitting and position difference, a maximum
+// position difference of 2, and a cost of 2 for a box without a symbol or a
+// symbol without a box.
+BoxBoundariesCalculatorConfig get_default_config() {
+  BoxBoundariesCalculatorConfig cfg;
+  cfg.merge_cost = 1;
+  cfg.split_cost = 1;
+  cfg.pos_diff_cost = 1;
+  cfg.max_pos_diff = 2;
+  cfg.box_with_no_symbol_cost = 2;
+  cfg.symbol_with_no_box_cost = 2;
+  return cfg;
+}
+
+} // namespace
+
+// Four boxes and four symbols that nearly coincide. Each symbol is expected
+// to take exactly one box (index ranges 0-1, 1-2, 2-3, 3-4) and the reported
+// x range is that of the box, not of the symbol.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, MatchesExactly) {
+  BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 2},
+    {31, 2, 40, 3},
+    {41, 3, 50, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {10, 20}, {20, 30}, {30, 40}, {40, 50}
+  }));
+}
+
+// Three boxes for four symbols: the middle box {21, 40} covers the second and
+// third symbols. Both of those symbols are expected to reference the same box
+// (index range 1-2), with the box split between them at x = 30.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, OneMergedInMiddle) {
+  BoxBoundariesCalculator calc{{{10, 20}, {21, 40}, {41, 50}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 2},
+    {30, 1, 40, 2},
+    {41, 2, 50, 3}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {10, 20}, {20, 30}, {30, 40}, {40, 50}
+  }));
+}
+
+// Five boxes for four symbols: the boxes {21, 25} and {26, 30} together cover
+// the second symbol. That symbol is expected to span both boxes (index range
+// 1-3, x range 21-30) while the other symbols take one box each.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, OneSplit) {
+  BoxBoundariesCalculator calc{{{10, 20}, {21, 25}, {26, 30}, {31, 40}, {41, 50}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 3},
+    {31, 3, 40, 4},
+    {41, 4, 50, 5}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {10, 20}, {20, 30}, {30, 40}, {40, 50}
+  }));
+}
+
+// Six boxes for four symbols, with the two extra boxes lying past the last
+// symbol. The last symbol is expected to absorb the final three boxes (index
+// range 3-6, x range 41-70).
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, ManySplitAtEnd) {
+  BoxBoundariesCalculator calc{
+    {
+      {10, 20}, {21, 30}, {31, 40}, {41, 50}, {51, 60}, {61, 70}
+    },
+    get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 2},
+    {31, 2, 40, 3},
+    {41, 3, 70, 6}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {10, 20}, {20, 30}, {30, 40}, {40, 50}
+  }));
+}
+
+// Symbols are offset by about half a box width (+5) to the right of the
+// boxes. The expected result is still a one-to-one assignment reporting the
+// x ranges of the boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, ShiftedSymbolPositionsForward) {
+  BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 2},
+    {31, 2, 40, 3},
+    {41, 3, 50, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {15, 25}, {25, 35}, {35, 45}, {45, 55}
+  }));
+}
+
+// Symbols are offset by about one and a half box widths (+15) to the right of
+// the boxes, so the last symbol starts past the last box. The expected result
+// is still a one-to-one assignment reporting the x ranges of the boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, VeryShiftedSymbolPositionsForward) {
+  BoxBoundariesCalculator calc{{{10, 20}, {21, 30}, {31, 40}, {41, 50}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {10, 0, 20, 1},
+    {21, 1, 30, 2},
+    {31, 2, 40, 3},
+    {41, 3, 50, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {25, 35}, {35, 45}, {45, 55}, {55, 65}
+  }));
+}
+
+// Symbols are offset by about half a box width (-5) to the left of the boxes.
+// The expected result is still a one-to-one assignment reporting the x ranges
+// of the boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, ShiftedSymbolPositionsBackward) {
+  BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {110, 0, 120, 1},
+    {121, 1, 130, 2},
+    {131, 2, 140, 3},
+    {141, 3, 150, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {105, 115}, {115, 125}, {125, 135}, {135, 145}
+  }));
+}
+
+// Symbols are offset by about one and a half box widths (-15) to the left of
+// the boxes, so the first symbol ends before the first box begins. The
+// expected result is still a one-to-one assignment reporting the x ranges of
+// the boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, VeryShiftedSymbolPositionsBackward) {
+  BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {110, 0, 120, 1},
+    {121, 1, 130, 2},
+    {131, 2, 140, 3},
+    {141, 3, 150, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {95, 105}, {105, 115}, {115, 125}, {125, 135}
+  }));
+}
+
+// The symbol positions have a gap of one symbol width (125 to 135) between
+// the second and third symbols, while the boxes are contiguous. The expected
+// result is still a one-to-one assignment reporting the x ranges of the
+// boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, HoleInMiddle) {
+  BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {110, 0, 120, 1},
+    {121, 1, 130, 2},
+    {131, 2, 140, 3},
+    {141, 3, 150, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {105, 115}, {115, 125}, {135, 145}, {145, 155}
+  }));
+}
+
+// The symbol positions have a gap of three symbol widths (115 to 145) between
+// the second and third symbols, so the first two symbols sit left of their
+// boxes and the last two sit right of them. The expected result is still a
+// one-to-one assignment reporting the x ranges of the boxes.
+// NOTE(review): expected entries are read as
+// {begin_x, begin_box_index, end_x, end_box_index} -- confirm against the
+// CharacterBoundaries definition.
+TEST(BoxBoundariesCalculatorTest, LargeHoleInMiddle) {
+  BoxBoundariesCalculator calc{{{110, 120}, {121, 130}, {131, 140}, {141, 150}},
+                               get_default_config()};
+
+  std::vector<CharacterBoundaries> expected = {
+    {110, 0, 120, 1},
+    {121, 1, 130, 2},
+    {131, 2, 140, 3},
+    {141, 3, 150, 4}
+  };
+
+  ASSERT_EQ(expected, calc.calculate_bounds({
+    {95, 105}, {105, 115}, {145, 155}, {155, 165}
+  }));
+}
+
+} // namespace tesseract