merging with tesseract master in order to create a pull request

2024-12-03 00:49:01 +08:00 · 2021-03-15 17:02:19 +01:00 · 2021-03-15 17:02:19 +01:00 · 594a000ecd
commit 594a000ecd
parent b2ed8038d1 c676d5bcff
664 changed files with 80036 additions and 87703 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,5 +1,4 @@
---
-BasedOnStyle:  Google
+BasedOnStyle: Google
 # Only merge empty functions.
 AllowShortFunctionsOnASingleLine: Empty
 # Do not allow short if statements.
@ -7,3 +6,16 @@ AllowShortIfStatementsOnASingleLine: false
 # Enforce always the same pointer alignment.
 DerivePointerAlignment: false
 IndentPPDirectives: AfterHash
+
+PointerAlignment: Right
+IncludeBlocks: Preserve
+FixNamespaceComments: true
+ColumnLimit: 100
+IndentWidth: 2
+#IndentAccessModifiers: false # not accepted atm
+AccessModifierOffset: -2 # set to minus IndentWidth (-IndentWidth)
+SpacesBeforeTrailingComments: 1
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+BreakConstructorInitializers: BeforeComma
+#ConstructorInitializerAllOnOneLineOrOnePerLine: false
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
+* text=auto
--- a/.github/workflows/autotools.yml
+++ b/.github/workflows/autotools.yml
@ -0,0 +1,320 @@
+name: autotools
+# autotools build of tesseract and training tools on ubuntu, macos homebrew and macports.
+# run command line tests, basicapitest and unittests.
+on:
+  #push:
+  schedule:
+    - cron: 0 20 * * *
+jobs:
+
+# ============================================================================================
+
+  linux:
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - { name: ubuntu-18.04-clang-7-autotools, os: ubuntu-18.04, cxx: clang++-7 }
+          - { name: ubuntu-18.04-clang-8-autotools, os: ubuntu-18.04, cxx: clang++-8 } #installed
+          - { name: ubuntu-18.04-clang-9-autotools, os: ubuntu-18.04, cxx: clang++-9 } #installed
+          - { name: ubuntu-18.04-clang-10-autotools, os: ubuntu-18.04, cxx: clang++-10 }
+
+          - { name: ubuntu-18.04-gcc-7-autotools, os: ubuntu-18.04, cxx: g++-7 } #installed
+          - { name: ubuntu-18.04-gcc-8-autotools, os: ubuntu-18.04, cxx: g++-8 } #installed
+          - { name: ubuntu-18.04-gcc-9-autotools, os: ubuntu-18.04, cxx: g++-9 } #installed
+          - { name: ubuntu-18.04-gcc-10-autotools, os: ubuntu-18.04, cxx: g++-10 } #installed
+
+          - { name: ubuntu-20.04-clang-7-autotools, os: ubuntu-20.04, cxx: clang++-7 }
+          - { name: ubuntu-20.04-clang-8-autotools, os: ubuntu-20.04, cxx: clang++-8 } #installed
+          - { name: ubuntu-20.04-clang-9-autotools, os: ubuntu-20.04, cxx: clang++-9 } #installed
+          - { name: ubuntu-20.04-clang-10-autotools, os: ubuntu-20.04, cxx: clang++-10 } #installed
+
+          - { name: ubuntu-20.04-gcc-7-autotools, os: ubuntu-20.04, cxx: g++-7 } #installed
+          - { name: ubuntu-20.04-gcc-8-autotools, os: ubuntu-20.04, cxx: g++-8 } #installed
+          - { name: ubuntu-20.04-gcc-9-autotools, os: ubuntu-20.04, cxx: g++-9 } #installed
+          - { name: ubuntu-20.04-gcc-10-autotools, os: ubuntu-20.04, cxx: g++-10 } #installed
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Download fonts, tessdata and langdata required for tests
+      run: |
+           git clone https://github.com/egorpugin/tessdata tessdata_unittest
+           cp tessdata_unittest/fonts/* test/testing/
+           mv tessdata_unittest/* ../
+
+    - name: Install Compiler
+      run: |
+           sudo apt-get install -y ${{ matrix.config.cxx }}
+
+    - name: Install dependencies
+      run: |
+           sudo apt-get install autoconf-archive libleptonica-dev -y
+           sudo apt-get install libicu-dev libpango1.0-dev libcairo2-dev -y
+           sudo apt-get install cabextract libarchive-dev -y
+           sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y
+
+    - name: Setup Tesseract
+      run: |
+           mkdir -p m4
+           ./autogen.sh
+
+    - name: Configure Tesseract
+      run: |
+           ./configure '--disable-shared' '--disable-openmp' '--disable-doc' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'
+
+    - name: Make and Install Tesseract
+      run: |
+           make -j 8
+           sudo make install install
+
+    - name: Make and Install Training Tools
+      run: |
+           make training -j 8
+           sudo make install training-install
+
+    - name: Make and run Unit Tests
+      run: |
+           make check
+
+    - name: Display Version for tesseract, lstmtraining, text2image
+      run: |
+           tesseract -v
+           lstmtraining -v
+           text2image -v
+      if: success() || failure()
+
+    - name: List languages in different test tessdata-dir
+      run: |
+           tesseract  --list-langs --tessdata-dir ../tessdata
+           tesseract  --list-langs --tessdata-dir ../tessdata_best
+           tesseract  --list-langs --tessdata-dir ../tessdata_fast
+
+    - name: Run Tesseract on test images in different languages
+      run: |
+           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata
+           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
+           tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata
+
+    - name: Run Tesseract basicapitest
+      run: |
+           export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
+           cd test
+           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/usr/local/include -L/usr/local/lib `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++11
+           ./basicapitest
+
+    - name: Display Compiler Version
+      run: |
+           ${{ matrix.config.cxx }} --version
+           git log -3 --pretty=format:'%h %ad %s | %an'
+      if: always()
+
+    - name: Display Unit Tests Report
+      run: |
+           cat test-suite.log
+      if: always()
+
+# ============================================================================================
+
+  brew:
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - { name: macos-10.15-clang-12-autotools, os: macos-10.15, cxx: clang++ }
+          - { name: macos-10.15-gcc-10-autotools, os: macos-10.15, cxx: g++-10 }
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Get fonts, tessdata and langdata required for unit tests
+      run: |
+           git clone https://github.com/egorpugin/tessdata tessdata_unittest
+           cp tessdata_unittest/fonts/* test/testing/
+           mv tessdata_unittest/* ../
+
+    - name: Install dependencies
+      run: |
+           brew install automake autoconf-archive
+           brew install leptonica
+           brew install cairo pango icu4c
+           brew install cabextract
+           brew install libarchive curl
+
+    - name: Setup Tesseract
+      run: |
+           mkdir -p m4
+           ./autogen.sh
+
+    - name: Configure Tesseract
+      run: |
+           ./configure '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2' "PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig:$(brew --prefix)/opt/libarchive/lib/pkgconfig:$(brew --prefix)/opt/libffi/lib/pkgconfig"
+
+    - name: Make and Install Tesseract
+      run: |
+           make -j 8
+           sudo make install install
+    - name: Make and Install Training Tools
+      run: |
+           make training -j 8
+           sudo make install training-install
+
+    - name: Make and run Unit Tests (clang)
+      if: startsWith(matrix.config.cxx, 'clang')
+      run: |
+           make check
+
+    - name: Make and run Unit Tests (unset LANG needed for g++-8, g++-9, g++-10 on macOS)
+      if: startsWith(matrix.config.cxx, 'g')
+      shell: bash
+      run: |
+           unset LANG LC_ALL LC_CTYPE
+           locale
+           make check
+
+    - name: Display Version for tesseract, lstmtraining, text2image
+      run: |
+           tesseract -v
+           lstmtraining -v
+           text2image -v
+      if: success() || failure()
+
+    - name: List languages in different test tessdata-dir
+      run: |
+           tesseract  --list-langs --tessdata-dir ../tessdata
+           tesseract  --list-langs --tessdata-dir ../tessdata_best
+           tesseract  --list-langs --tessdata-dir ../tessdata_fast
+
+    - name: Run Tesseract on test images in different languages
+      run: |
+           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata
+           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
+           tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata
+
+    - name: Run Tesseract basicapitest
+      run: |
+           export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
+           cd test
+           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/usr/local/include -L/usr/local/lib `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++11
+           ./basicapitest
+
+    - name: Display Compiler Version
+      run: |
+           ${{ matrix.config.cxx }} --version
+           git log -3 --pretty=format:'%h %ad %s | %an'
+      if: always()
+
+    - name: Display Unit Tests Report
+      run: |
+           cat test-suite.log
+      if: always()
+
+# ============================================================================================
+
+  ports:
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - { name: macos-10.15-clang-12-autotools, os: macos-10.15, cxx: clang++ }
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Get fonts, tessdata and langdata required for tests
+      run: |
+           git clone https://github.com/egorpugin/tessdata tessdata_unittest
+           cp tessdata_unittest/fonts/* test/testing/
+           mv tessdata_unittest/* ../
+
+    - name: Install Macports
+      run: |
+        curl -LO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci; source ./macports-ci install
+
+    - name: Install Dependencies
+      run: |
+           sudo port install autoconf autoconf-archive automake libtool pkgconfig
+           sudo port install leptonica
+           sudo port install cairo pango
+           sudo port install icu +devel
+           sudo port install cabextract libarchive curl
+
+    - name: Setup Tesseract
+      run: |
+           mkdir -p m4
+           ./autogen.sh
+
+    - name: Configure Tesseract
+      run: |
+           ./configure  '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'
+
+    - name: Make and Install Tesseract
+      run: |
+           make -j 8
+           sudo make install install
+
+    - name: Make and Install Training Tools
+      run: |
+           make training -j 8
+           sudo make install training-install
+
+    - name: Make and run Unit Tests (clang)
+      if: startsWith(matrix.config.cxx, 'clang')
+      run: |
+           make check
+
+    - name: Display Version for tesseract, lstmtraining, text2image
+      run: |
+           tesseract -v
+           lstmtraining -v
+           text2image -v
+      if: success() || failure()
+
+    - name: List languages in different test tessdata-dir
+      run: |
+           tesseract  --list-langs --tessdata-dir ../tessdata
+           tesseract  --list-langs --tessdata-dir ../tessdata_best
+           tesseract  --list-langs --tessdata-dir ../tessdata_fast
+
+    - name: Run Tesseract on test images in different languages
+      run: |
+           tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata
+           tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata
+           tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
+           tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata
+
+    - name: Run Tesseract basicapitest
+      run: |
+           export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
+           cd test
+           ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/opt/local/include -L/opt/local/lib -I/usr/local/include -L/usr/local/lib `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++11
+           ./basicapitest
+
+    - name: Display Compiler Version
+      run: |
+           ${{ matrix.config.cxx }} --version
+           git log -3 --pretty=format:'%h %ad %s | %an'
+      if: always()
+
+    - name: Display Unit Tests Report
+      run: |
+           cat test-suite.log
+      if: always()
+
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@ -0,0 +1,152 @@
+name: cmake
+# cmake build of tesseract and training tools on ubuntu and macOS homebrew using Ninja.
+# test command line version of tesseract. run basicapitest.
+on:
+  #push:
+  schedule:
+    - cron: 0 21 * * *
+
+jobs:
+  basictests:
+    name: ${{ matrix.config.name }}
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+
+          - { name: macos-10.15-clang-12-cmake, os: macos-10.15, cxx: clang++ } # default
+          - { name: macos-10.15-clang-11-cmake, os: macos-10.15, cxx: '$(brew --prefix llvm)/bin/clang++' }  #installed
+          - { name: macos-10.15-gcc-8-cmake, os: macos-10.15, cxx: g++-8 } #installed
+          - { name: macos-10.15-gcc-9-cmake, os: macos-10.15, cxx: g++-9 } #installed
+          - { name: macos-10.15-gcc-10-cmake, os: macos-10.15, cxx: g++-10 } #installed
+
+          - { name: ubuntu-18.04-clang-7-cmake, os: ubuntu-18.04, cxx: clang++-7 }
+          - { name: ubuntu-18.04-clang-8-cmake, os: ubuntu-18.04, cxx: clang++-8 } #installed
+          - { name: ubuntu-18.04-clang-9-cmake, os: ubuntu-18.04, cxx: clang++-9 } #installed
+          - { name: ubuntu-18.04-clang-10-cmake, os: ubuntu-18.04, cxx: clang++-10 }
+
+          - { name: ubuntu-18.04-gcc-7-cmake, os: ubuntu-18.04, cxx: g++-7 } #installed
+          - { name: ubuntu-18.04-gcc-8-cmake, os: ubuntu-18.04, cxx: g++-8 } #installed
+          - { name: ubuntu-18.04-gcc-9-cmake, os: ubuntu-18.04, cxx: g++-9 } #installed
+          - { name: ubuntu-18.04-gcc-10-cmake, os: ubuntu-18.04, cxx: g++-10 } #installed
+
+          - { name: ubuntu-20.04-clang-7-cmake, os: ubuntu-20.04, cxx: clang++-7 }
+          - { name: ubuntu-20.04-clang-8-cmake, os: ubuntu-20.04, cxx: clang++-8 } #installed
+          - { name: ubuntu-20.04-clang-9-cmake, os: ubuntu-20.04, cxx: clang++-9 } #installed
+          - { name: ubuntu-20.04-clang-10-cmake, os: ubuntu-20.04, cxx: clang++-10 } #installed
+
+          - { name: ubuntu-20.04-gcc-7-cmake, os: ubuntu-20.04, cxx: g++-7 } #installed
+          - { name: ubuntu-20.04-gcc-8-cmake, os: ubuntu-20.04, cxx: g++-8 } #installed
+          - { name: ubuntu-20.04-gcc-9-cmake, os: ubuntu-20.04, cxx: g++-9 } #installed
+          - { name: ubuntu-20.04-gcc-10-cmake, os: ubuntu-20.04, cxx: g++-10 } #installed
+
+    steps:
+      - name: Install compilers on Linux
+        run: |
+             sudo apt-get install ${{ matrix.config.cxx }} -y
+        if: runner.os == 'Linux'
+
+# sudo apt-get install libarchive-dev libcurl4-openssl-dev libcurl4 curl -y
+      - name: Install dependencies on Linux
+        run: |
+           sudo apt-get install autoconf-archive libleptonica-dev -y
+           sudo apt-get install libicu-dev libpango1.0-dev libcairo2-dev -y
+           sudo apt-get install cabextract -y
+           sudo apt-get install ninja-build -y
+        if: runner.os == 'Linux'
+
+      - name: Install dependencies on macOS
+        run: |
+           brew install automake autoconf-archive
+           brew install leptonica
+           brew install cairo pango icu4c
+           brew install cabextract
+           brew install ninja
+           ninja --version
+           cmake --version
+        if: runner.os == 'macOS'
+
+      - name: Checkout Source
+        uses: actions/checkout@v2
+        with:
+             submodules: recursive
+
+      - name: Configure Tesseract (Linux)
+        run: |
+             mkdir build
+             mkdir inst
+             cmake \
+               -S . \
+               -B build \
+               -G Ninja \
+               -DCMAKE_BUILD_TYPE=Release \
+               -DOPENMP_BUILD=OFF \
+               -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \
+               -DCMAKE_INSTALL_PREFIX:PATH=inst
+        if: runner.os == 'Linux'
+
+      - name: Configure Tesseract (macOS)
+        shell: bash
+        run: |
+             set -e
+             export PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig:$(brew --prefix)/opt/libarchive/lib/pkgconfig:/$(brew --prefix)/opt/libffi/lib/pkgconfig:$PKG_CONFIG_PATH
+             mkdir build
+             mkdir inst
+             cmake \
+               -S . \
+               -B build \
+               -G Ninja \
+               -DCMAKE_BUILD_TYPE=Release \
+               -DOPENMP_BUILD=OFF \
+               -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \
+               -DCMAKE_INSTALL_PREFIX:PATH=inst
+        if: runner.os == 'macOS'
+
+      - name: Build Tesseract
+        run: |
+             cmake --build build --config Release --target install
+
+      - name: Display Tesseract Version
+        run: |
+             build/inst/bin/tesseract -v
+
+      - name: Display Training Tools Version
+        run: |
+             build/inst/bin/lstmtraining -v
+             build/inst/bin/text2image -v
+
+      - name: Download fonts, tessdata and langdata required for tests
+        run: |
+             git clone https://github.com/egorpugin/tessdata tessdata_unittest
+             cp tessdata_unittest/fonts/* test/testing/
+             mv tessdata_unittest/* ../
+
+      - name: List languages in different tessdata-dir
+        run: |
+             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata
+             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata_best
+             build/inst/bin/tesseract  --list-langs --tessdata-dir ../tessdata_fast
+
+      - name: Run Tesseract on test images in different languages
+        run: |
+             build/inst/bin/tesseract test/testing/phototest.tif - --oem 1  --tessdata-dir ../tessdata
+             build/inst/bin/tesseract test/testing/raaj.tif - -l hin --oem 1   --tessdata-dir ../tessdata
+             build/inst/bin/tesseract test/testing/viet.tif - -l vie --oem 1   --tessdata-dir ../tessdata
+             build/inst/bin/tesseract test/testing/hebrew.png - -l heb --oem 1   --tessdata-dir ../tessdata
+             build/inst/bin/tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
+             build/inst/bin/tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ../tessdata
+
+      - name: Build and run basicapitest
+        run: |
+             export "PKG_CONFIG_PATH=$GITHUB_WORKSPACE/build/inst/lib/pkgconfig/:$PKG_CONFIG_PATH"
+             cd test
+             ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp "-I$GITHUB_WORKSPACE/build/inst/include" "-L$GITHUB_WORKSPACE/build/inst/lib" `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++11
+             ./basicapitest
+
+      - name: Display Compiler Version
+        run: |
+             ${{ matrix.config.cxx }} --version
+             git log -3 --pretty=format:'%h %ad %s | %an'
+        if: always()
+
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@ -1,45 +0,0 @@
-name: linux
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest]
-
-    steps:
-    - uses: actions/checkout@v1
-
-    - name: Download SW
-      shell: cmake -P {0}
-      run: |
-        if (WIN32)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-windows-client.zip" ./sw.zip)
-        elseif (APPLE)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-macos-client.tar.gz" ./sw.zip)
-        else()
-          file(DOWNLOAD "https://software-network.org/client/sw-master-linux-client.tar.gz" ./sw.zip)
-        endif()
-
-    - name: Unpack SW
-      run: |
-        cmake -E tar xvf sw.zip
-        chmod 755 sw
-
-    - name: gcc
-      run: |
-        sudo add-apt-repository ppa:jonathonf/gcc-9.0
-        sudo apt-get update
-        sudo apt-get install g++-9
-
-    - name: llvm
-      run: |
-        wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | sudo apt-key add -
-        sudo add-apt-repository 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main' -y
-        sudo apt-get update -q
-        sudo apt-get install -y clang-9 lld-9 libc++-9-dev libc++abi-9-dev clang-tools-9
-
-    - name: build
-      run: ./sw -static -shared -config d,r build
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@ -1,34 +0,0 @@
-name: macos
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [macOS-latest]
-
-    steps:
-    - uses: actions/checkout@v1
-
-    - name: Download SW
-      shell: cmake -P {0}
-      run: |
-        if (WIN32)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-windows-client.zip" ./sw.zip)
-        elseif (APPLE)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-macos-client.tar.gz" ./sw.zip)
-        else()
-          file(DOWNLOAD "https://software-network.org/client/sw-master-linux-client.tar.gz" ./sw.zip)
-        endif()
-
-    - name: Unpack SW
-      run: cmake -E tar xvf sw.zip
-
-    - name: chmod
-      run: chmod 755 sw
-      shell: sh
-
-    - name: build
-      run: ./sw -static -shared -config d,r build
--- a/.github/workflows/sw.yml
+++ b/.github/workflows/sw.yml
@ -0,0 +1,73 @@
+name: sw
+
+on:
+  push:
+  pull_request:
+  schedule:
+    # every day
+    - cron: 0 0 * * *
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [windows-latest, ubuntu-20.04, macOS-latest]
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+    - uses: egorpugin/sw-action@master
+
+    - name: build
+      if: matrix.os == 'windows-latest'
+      run: ./sw -static -shared -platform x86,x64 -config d,r build
+
+    - name: build
+      if: matrix.os != 'windows-latest'
+      run: ./sw -static -shared -config d,r build -Dwith-tests=1
+
+    - name: download test data
+      run: git clone https://github.com/egorpugin/tessdata tessdata_unittest
+
+    - name: copy fonts
+      if: matrix.os != 'windows-latest'
+      run: cp tessdata_unittest/fonts/* test/testing/
+
+    - name: copy fonts
+      if: matrix.os == 'windows-latest'
+      run: Copy-Item -Path "tessdata_unittest\fonts\*" -Destination "test\testing" -Recurse
+      shell: pwsh
+
+    - name: test
+      if: matrix.os != 'windows-latest'
+      run: ./sw -static -shared -config "d,r" test -Dwith-tests=1 "-Dskip-tests=lstm,lstm_recode"
+      continue-on-error: true
+
+    - name: test-nightly
+      if: matrix.os != 'windows-latest' && github.event.schedule=='0 0 * * *'
+      run: ./sw -static -shared -config "d,r" test -Dwith-tests=1
+      continue-on-error: true
+
+    # windows tests hang here for some reason, investigate
+    #- name: test
+      #if: matrix.os == 'windows-latest'
+      #run: ./sw test -Dwith-tests=1 "-Dskip-tests=lstm,lstm_recode"
+      #continue-on-error: true
+
+    - name: Upload Unit Test Results
+      if: always() && matrix.os != 'windows-latest'
+      uses: actions/upload-artifact@v2
+      with:
+        name: Test Results (${{ matrix.os }})
+        path: .sw/test/results.xml
+
+    - name: Publish Test Report
+      if: always() && matrix.os != 'windows-latest'
+      uses: mikepenz/action-junit-report@v1
+      with:
+        check_name: test (${{ matrix.os }})
+        report_paths: .sw/test/results.xml
+        github_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/unittest-disablelegacy.yml
+++ b/.github/workflows/unittest-disablelegacy.yml
@ -0,0 +1,76 @@
+name: unittest-disablelegacy
+# autotools build on ubuntu, unittests with disabled legacy engine.
+# currently some unittests are failing with disabled legacy engine.
+
+on:
+  #push:
+  schedule:
+    - cron: 0 0 1 * *
+
+jobs:
+  linux:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        compiler: [ g++, clang++ ]
+        os: [ ubuntu-18.04, ubuntu-20.04 ]
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Install dependencies
+      run: |
+           sudo apt-get install autoconf-archive libleptonica-dev libpango1.0-dev -y
+           sudo apt-get install cabextract -y
+
+    - name: Setup
+      run: |
+           mkdir -p m4
+           ./autogen.sh
+
+    - name: Configure
+      run: |
+           ./configure '--disable-shared' '--disable-legacy' 'CXX=${{ matrix.compiler }}'
+
+    - name: Make and Install Tesseract
+      run: |
+           make -j 8
+           sudo make install install
+
+    - name: Make and Install Training Tools
+      run: |
+           make training -j 8
+           sudo make install training-install
+
+    - name: Display Version
+      run: |
+           ${{ matrix.compiler }} --version
+           tesseract -v
+           lstmtraining -v
+           text2image -v
+      if: success() || failure()
+
+    - name: Download fonts, tessdata and langdata required for tests
+      run: |
+           git clone https://github.com/egorpugin/tessdata tessdata_unittest
+           cp tessdata_unittest/fonts/* test/testing/
+           mv tessdata_unittest/* ../
+
+    - name: Run Tesseract on phototest.tif and devatest.png
+      run: |
+           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata
+           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata
+
+    - name: Make and run Unit Tests
+      run: |
+           make check -j 4
+
+    - name: Display Unit Tests Report
+      run: |
+           git log -3
+           ${{ matrix.compiler }} --version
+           cat test-suite.log
+      if: always()
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@ -0,0 +1,100 @@
+name: unittest
+# autotools build on ubuntu and macOS homebrew.
+# unittests with address and undefined-behavior sanitizers.
+# [  FAILED  ] LSTMTrainerTest.DeterminismTest - clang version 9.0.0-2~ubuntu18.04.2
+on:
+  #push:
+  schedule:
+    - cron: 0 19 * * *
+
+jobs:
+  sanitizers:
+    name: ${{ matrix.config.name }}
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - { name: macos-10.15-clang-unittest, os: macos-10.15, cxx: clang++ }
+          - { name: ubuntu-18.04-clang-8-unittest, os: ubuntu-18.04, cxx: clang++-8 }
+          - { name: ubuntu-18.04-gcc-unittest, os: ubuntu-18.04, cxx: g++ }
+          - { name: ubuntu-20.04-clang-10-unittest, os: ubuntu-20.04, cxx: clang++-10 }
+          - { name: ubuntu-20.04-gcc-unittest, os: ubuntu-20.04, cxx: g++ }
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+
+    - name: Install dependencies (Linux)
+      run: |
+           sudo apt-get install autoconf-archive libleptonica-dev libpango1.0-dev -y
+           sudo apt-get install cabextract -y
+      if: runner.os == 'Linux'
+
+    - name: Install dependencies (MacOS Homebrew)
+      run: |
+           brew install automake autoconf-archive libarchive
+           brew install leptonica cairo pango
+           brew install cabextract abseil
+      if: runner.os == 'macOS'
+
+    - name: Setup
+      run: |
+           mkdir -p m4
+           ./autogen.sh
+
+    - name: Configure (Linux)
+      run: |
+           ./configure '--disable-shared' \
+                'CXX=${{ matrix.config.cxx }}' \
+                'CXXFLAGS=-g -O2 -fsanitize=address,undefined'
+      if: runner.os == 'Linux'
+
+    - name: Configure (MacOS Homebrew)
+      run: |
+           ./configure '--disable-shared' '--with-pic' \
+                'CXX=${{ matrix.config.cxx }}' \
+                'CXXFLAGS=-g -O2 -fsanitize=address,undefined' \
+                "PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig:$(brew --prefix)/opt/libarchive/lib/pkgconfig:$(brew --prefix)/opt/libffi/lib/pkgconfig"
+      if: runner.os == 'macOS'
+
+    - name: Make and Install Tesseract
+      run: |
+           make -j 8
+           sudo make install
+
+    - name: Make and Install Training Tools
+      run: |
+           make training -j 8
+           sudo make training-install
+
+    - name: Display Version
+      run: |
+           ${{ matrix.config.cxx }} --version
+           tesseract -v
+           lstmtraining -v
+           text2image -v
+      if: success() || failure()
+
+    - name: Download fonts, tessdata and langdata required for tests
+      run: |
+           git clone https://github.com/egorpugin/tessdata tessdata_unittest
+           cp tessdata_unittest/fonts/* test/testing/
+           mv tessdata_unittest/* ../
+
+    - name: Run Tesseract on phototest.tif and devatest.png
+      run: |
+           tesseract test/testing/phototest.tif -  --tessdata-dir ../tessdata
+           tesseract test/testing/devatest.png - -l hin+eng  --tessdata-dir ../tessdata
+
+    - name: Make and run Unit Tests
+      run: |
+           make check -j 4
+
+    - name: Display Unit Tests Report
+      run: |
+           cat test-suite.log
+           ${{ matrix.config.cxx }} --version
+           git log -3 --pretty=format:'%h %ad %s | %an'
+      if: always()
--- a/.github/workflows/vcpkg-4.11.yml
+++ b/.github/workflows/vcpkg-4.11.yml
@ -0,0 +1,109 @@
+name: vcpkg-4.11
+# build tesseract 4.1 using vcpkg and cmake on ubuntu and windows.
+# build and run basicapitest on windows.
+# macos fails on leptonica build - https://github.com/microsoft/vcpkg/issues/16116
+on:
+  #push:
+  schedule:
+    - cron: 0 22 * * *
+# Workflow-wide environment. The path points into an Xcode 11.2.1 bundle, so
+# it is presumably only meaningful on macOS runners.
+# NOTE(review): the job matrix below lists only ubuntu-latest and
+# windows-latest - confirm whether this variable is still needed.
+env:
+    DEVELOPER_DIR: /Applications/Xcode_11.2.1.app/Contents/Developer
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+
+    steps:
+      - name: Checkout Tesseract Source (for test images)
+        uses: actions/checkout@v2
+        with:
+          submodules: recursive
+
+      - name: Install vcpkg (Linux)
+        run: |
+             git clone https://github.com/microsoft/vcpkg
+             vcpkg/bootstrap-vcpkg.sh
+             vcpkg/vcpkg integrate install
+        if: runner.os == 'Linux'
+
+      - name: Build Tesseract 4.1.1 (Linux)
+        run: |
+             vcpkg/vcpkg install tesseract:x64-linux
+        if: runner.os == 'Linux'
+
+      # Runs the Visual Studio 2019 Enterprise x64 environment script under cmd.
+      # NOTE(review): each step runs in its own shell process, so variables set
+      # by vcvars64.bat here likely do not carry over to later steps - confirm
+      # whether this step has any effect on the build.
+      - name: Visual Studio Setup (Windows)
+        shell: cmd
+        run: |
+             call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+        if: runner.os == 'Windows'
+
+      - name: Install vcpkg (Windows)
+        run: |
+             git clone https://github.com/microsoft/vcpkg
+             vcpkg/bootstrap-vcpkg.bat
+             vcpkg/vcpkg integrate install
+        if: runner.os == 'Windows'
+
+      - name: Build and Install Tesseract and dependencies using vcpkg (Windows)
+        run: |
+            vcpkg/vcpkg install tesseract:x64-windows
+        if: runner.os == 'Windows'
+
+      - name: Download tessdata used for tests
+        run: |
+             git clone https://github.com/egorpugin/tessdata tessdata_unittest
+             mv tessdata_unittest/* ../
+        if: runner.os == 'Windows'
+        
+      - name: Create CMakeLists.txt file for basicapitest
+        shell: bash
+        run: |
+             cd test
+             cat << "EOF" > CMakeLists.txt
+             cmake_minimum_required(VERSION 3.19)
+             project( basicapitest )
+             find_package( Tesseract REQUIRED )
+             find_package( Leptonica REQUIRED )
+             include_directories(${Tesseract_INCLUDE_DIRS})
+             include_directories(${Leptonica_INCLUDE_DIRS})
+             add_executable( basicapitest testing/basicapitest.cpp )
+             target_link_libraries(basicapitest ${Leptonica_LIBRARIES})
+             target_link_libraries(basicapitest ${Tesseract_LIBRARIES})
+             target_link_libraries(basicapitest libtesseract)
+             EOF
+             cat CMakeLists.txt
+        if: runner.os == 'Windows'
+        
+      - name: Configure basicapitest
+        run: |
+             cd test
+             cmake . "-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake"
+        if: runner.os == 'Windows'
+
+      - name: Build basicapitest
+        run: |
+             cd test
+             cmake --build .  --config Release
+        if: runner.os == 'Windows'
+        
+      - name: Run basicapitest (Windows)
+        run: |
+             cd test
+             D:\a\tesseract\tesseract\test\Release\basicapitest.exe
+        if: runner.os == 'Windows'
+
+      # Guarded by runner.os == 'macOS', but the job matrix only contains
+      # ubuntu-latest and windows-latest, so this step never runs as configured.
+      # It is kept for the known leptonica build failure referenced in the
+      # header of this workflow. Unlike the Linux/Windows steps it calls a
+      # `vcpkg` found on PATH instead of the cloned vcpkg/vcpkg.
+      - name: Build Tesseract 4.1.1 (macOS) Fails
+        run: |
+             xcode-select --print-path
+             vcpkg install leptonica:x64-osx
+             vcpkg install tesseract:x64-osx
+        if: runner.os == 'macOS'
+
+      # Prints the vcpkg debug install log for leptonica. Guarded by
+      # runner.os == 'macOS', which is not part of the job matrix, so this step
+      # never runs as configured.
+      - name: Display Leptonica error log (macOS) Fails
+        run: |
+             cat /usr/local/share/vcpkg/buildtrees/leptonica/install-x64-osx-dbg-out.log
+        if: runner.os == 'macOS'
--- a/.github/workflows/vcpkg.yml
+++ b/.github/workflows/vcpkg.yml
@ -0,0 +1,100 @@
+name: vcpkg
+# build and test of tesseract on windows using vcpkg and cmake.
+# vcpkg with -head does not work. https://github.com/microsoft/vcpkg/issues/16019
+on:
+  #push:
+  schedule:
+    - cron: 0 23 * * *
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [windows-2019]
+
+    steps:
+      - name: Checkout Tesseract Source (--head from master branch)
+        uses: actions/checkout@v2
+        with:
+          submodules: recursive
+
+      # Runs the Visual Studio 2019 Enterprise x64 environment script under cmd.
+      # NOTE(review): each step runs in its own shell process, so variables set
+      # by vcvars64.bat here likely do not carry over to later steps - confirm
+      # whether this step has any effect on the build.
+      - name: Visual Studio Setup
+        shell: cmd
+        run: |
+             call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+
+      - name: Install vcpkg
+        run: |
+             git clone https://github.com/microsoft/vcpkg
+             vcpkg/bootstrap-vcpkg.bat
+             vcpkg/vcpkg integrate install
+
+      - name: Build and Install Leptonica and image libraries using vcpkg
+        run: |
+             vcpkg/vcpkg install leptonica:x64-windows
+
+      - name: Configure and Build Tesseract (--head from master branch) with cmake
+        run: |
+             cmake . -B build -DCMAKE_BUILD_TYPE=Release -DSW_BUILD=OFF -DOPENMP_BUILD=OFF -DBUILD_TRAINING_TOOLS=OFF "-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake"
+             cmake --build build --config Release --target install
+
+      - name: Display Tesseract Version
+        run: |
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe --version
+
+      # Writes test/CMakeLists.txt from a quoted heredoc (so ${...} is left for
+      # CMake to expand) and prints it. The generated project builds
+      # testing/basicapitest.cpp and links it against Leptonica, Tesseract and
+      # an imported `libtesseract` target.
+      # NOTE(review): the imported library location is hard-coded to
+      # D:/a/tesseract/tesseract/build/Release/tesseract50.lib, i.e. it assumes
+      # that checkout path and library version on the runner - confirm.
+      - name: Create CMakeLists.txt file for basicapitest
+        shell: bash
+        run: |
+             cd test
+             cat << "EOF" > CMakeLists.txt
+             cmake_minimum_required(VERSION 3.19)
+             project( basicapitest )
+             find_package( Tesseract REQUIRED )
+             find_package( Leptonica REQUIRED )
+             include_directories(${Tesseract_INCLUDE_DIRS})
+             include_directories(${Leptonica_INCLUDE_DIRS})
+             add_executable( basicapitest testing/basicapitest.cpp )
+             target_link_libraries(basicapitest ${Leptonica_LIBRARIES})
+             target_link_libraries(basicapitest ${Tesseract_LIBRARIES})
+             add_library(libtesseract UNKNOWN IMPORTED)
+             set_property(TARGET libtesseract PROPERTY IMPORTED_LOCATION D:/a/tesseract/tesseract/build/Release/tesseract50.lib)
+             target_link_libraries(basicapitest libtesseract)
+             EOF
+             cat CMakeLists.txt
+
+      - name: Configure basicapitest
+        run: |
+             cd test
+             cmake . "-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake"
+
+      - name: Build basicapitest
+        run: |
+             cd test
+             cmake --build .  --config Release
+
+      - name: Download tessdata and image files used for tests
+        run: |
+             git clone https://github.com/egorpugin/tessdata tessdata_unittest
+             mv tessdata_unittest/* ../
+
+      - name: Run basicapitest
+        run: |
+             cd test
+             D:\a\tesseract\tesseract\test\Release\basicapitest.exe
+
+      - name: Run Tesseract CLI on test images in different languages
+        run: |
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\phototest.tif - --oem 1  --tessdata-dir ..\tessdata
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\raaj.tif - -l hin --oem 1   --tessdata-dir ..\tessdata
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\viet.tif - -l vie --oem 1   --tessdata-dir ..\tessdata
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\hebrew.png - -l heb --oem 1   --tessdata-dir ..\tessdata
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\eurotext.tif - -l fra --oem 1 --tessdata-dir ..\tessdata_best
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  test\testing\arabic.tif - -l ara --oem 1 --psm 6  --tessdata-dir ..\tessdata
+
+      - name: List languages in different test tessdata-dir
+        run: |
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  --list-langs --tessdata-dir ..\tessdata
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  --list-langs --tessdata-dir ..\tessdata_best
+             D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe  --list-langs --tessdata-dir ..\tessdata_fast
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@ -1,35 +0,0 @@
-name: windows
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [windows-latest]
-        #os: [windows-latest, ubuntu-latest, macOS-latest]
-
-    steps:
-    - uses: actions/checkout@v1
-
-    - name: Download SW
-      shell: cmake -P {0}
-      run: |
-        if (WIN32)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-windows-client.zip" ./sw.zip)
-        elseif (APPLE)
-          file(DOWNLOAD "https://software-network.org/client/sw-master-macos-client.tar.gz" ./sw.zip)
-        else()
-          file(DOWNLOAD "https://software-network.org/client/sw-master-linux-client.tar.gz" ./sw.zip)
-        endif()
-
-    - name: Unpack SW
-      run: cmake -E tar xvf sw.zip
-
-    - name: chmod
-      run: chmod 755 sw
-      shell: sh
-
-    - name: build
-      run: ./sw -static -shared -platform x86,x64 -config d,r build
--- a/.gitignore
+++ b/.gitignore
@ -63,7 +63,7 @@ config_auto.h
 # ignore compilation files
 build/*
 /bin
-*/.deps/*
+.deps
 .dirstamp
 /.libs
 */.libs/*
@ -80,6 +80,7 @@ __pycache__

 # tessdata
 *.traineddata
+tessdata_*

 # OpenCL
 tesseract_opencl_profile_devices.dat
--- a/.gitmodules
+++ b/.gitmodules
@ -6,4 +6,4 @@
 	url = https://github.com/google/googletest.git
 [submodule "test"]
 	path = test
-	url = https://github.com/tesseract-ocr/test
+	url = https://github.com/tesseract-ocr/test.git
--- a/.travis.yml
+++ b/.travis.yml
@ -1,51 +1,30 @@
 # Travis CI configuration for Tesseract
-
-language: cpp
-
-dist: xenial
-
-env:
-  - LEPT_VER=1.78.0
-
+sudo: false
 notifications:
  email: false
+language: cpp

-sudo: false
-
-os:
-  - linux
-  - osx
-
-addons:
-  apt:
-    sources:
-    #- ubuntu-toolchain-r-test
-    packages:
-      - libarchive-dev
-      - libpango1.0-dev
-    #- g++-6
-
-#matrix:
-  #include:
-    #- os: osx
-      #install:
-      #script: brew install tesseract --HEAD
-      #cache:
-        #directories:
-          #- $HOME/Library/Caches/Homebrew
-  #allow_failures:
-    #- script: brew install tesseract --HEAD
-
+os: linux
+dist: focal
+arch:
+  - amd64
+  - arm64
+  - ppc64le
+  - s390x
+compiler:
+  - gcc
+  - clang
+env:
+  - LEPT_VER=1.80.0
 cache:
  directories:
-  - leptonica-$LEPT_VER
+    - leptonica-$LEPT_VER

 before_install:
-  - if [[ $TRAVIS_OS_NAME == linux   ]]; then LINUX=true; fi
-  - if [[ $TRAVIS_OS_NAME == osx     ]]; then OSX=true; fi
+  - sudo apt-get install libpango1.0-dev libicu-dev libtiff5-dev -y
+  - rm -rf leptonica-$LEPT_VER/usr
  
 install:
-  #- if [[ $LINUX && "$CXX" = "g++" ]]; then export CXX="g++-6" CC="gcc-6"; fi
  - if test ! -d leptonica-$LEPT_VER/src; then curl -Ls https://github.com/DanBloomberg/leptonica/archive/$LEPT_VER.tar.gz | tar -xz; fi
  - if test ! -d leptonica-$LEPT_VER/usr; then cmake -Hleptonica-$LEPT_VER -Bleptonica-$LEPT_VER/build -DCMAKE_INSTALL_PREFIX=leptonica-$LEPT_VER/usr; fi
  - if test ! -e leptonica-$LEPT_VER/usr/lib/libleptonica.so; then make -C leptonica-$LEPT_VER/build install; fi
@ -55,3 +34,12 @@ script:
  - cd build
  - cmake .. -DLeptonica_DIR=leptonica-$LEPT_VER/build -DSW_BUILD=OFF
  - make
+  - sudo make install
+  
+#after_script: # let those commands trigger build errors
+  - tesseract -v
+  - text2image -v
+  - lstmtraining -v
+  - ls /home/travis/build/tesseract-ocr/tesseract/test/testing/*.tif
+  - wget https://github.com/egorpugin/tessdata/raw/master/tessdata/eng.traineddata
+  - tesseract /home/travis/build/tesseract-ocr/tesseract/test/testing/phototest.tif - -l eng --tessdata-dir ./
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -32,6 +32,13 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${EXECUTABLE_OUTPUT_PATH}")
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake Targets")

+# Policy CMP0091 exists from CMake 3.15 on; opt in to its NEW behaviour for
+# Windows builds only and report that in the configure log.
+if(WIN32 AND NOT ${CMAKE_VERSION} VERSION_LESS "3.15.0")
+    cmake_policy(SET CMP0091 NEW)
+    message(STATUS "Setting policy CMP0091 to NEW")
+endif()
+
 ###############################################################################
 #
 # project settings
@ -75,12 +82,17 @@ else()
    option(SW_BUILD "Build with sw" OFF)
 endif()
 option(OPENMP_BUILD "Build with openmp support" OFF)  # see issue #1662
-option(AUTO_OPTIMIZE "Usage of cmake auto optimize macros (not suitable for portable build)" ON)
 option(GRAPHICS_DISABLED "Disable disable graphics (ScrollView)" OFF)
 option(DISABLED_LEGACY_ENGINE "Disable the legacy OCR engine" OFF)
+option(ENABLE_LTO "Enable link-time optimization" OFF)
 option(BUILD_TRAINING_TOOLS "Build training tools" ON)
 option(BUILD_TESTS "Build tests" OFF)
 option(USE_SYSTEM_ICU "Use system ICU" OFF)
+# Offer the MT (static runtime) switch only for MSVC on Windows with CMake 3.15+.
+if(WIN32 AND MSVC AND NOT ${CMAKE_VERSION} VERSION_LESS "3.15.0")
+    option(WIN32_MT_BUILD "Build with MT flag for MSVC" OFF)
+endif()

 ###############################################################################
 #
@ -88,6 +100,10 @@ option(USE_SYSTEM_ICU "Use system ICU" OFF)
 #
 ###############################################################################

+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(CLANG 1)
+endif()
+
 if(NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting build type to 'Release' as none was specified.")
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
@ -96,44 +112,120 @@ endif()

 include(CheckCXXCompilerFlag)

-# Check for C++ standard to use
-get_property(known_features GLOBAL PROPERTY CMAKE_CXX_KNOWN_FEATURES)
-if (cxx_std_17 IN_LIST known_features)
-    set(CMAKE_CXX_STANDARD 17)
-elseif (cxx_std_14 IN_LIST known_features)
-    set(CMAKE_CXX_STANDARD 14)
-else()  # minimum required standard
-    set(CMAKE_CXX_STANDARD 11)
-endif()
-
-# Avoid using experimental c++1y (c++1z) standard even if the compiler announces cxx14 (cxx17)
-# in CMAKE_CXX_KNOWN_FEATURES and CMAKE_CXX_COMPILE_FEATURES
-# It is the case of clang 3.9, 4.0 (announces c++1z) and gcc 4.8 (announces c++1y)
-if ("${CMAKE_CXX17_STANDARD_COMPILE_OPTION}" STREQUAL "-std=c++1z")
-  set(CMAKE_CXX_STANDARD 14)
-endif()
-if ("${CMAKE_CXX14_STANDARD_COMPILE_OPTION}" STREQUAL "-std=c++1y")
-  set(CMAKE_CXX_STANDARD 11)
-endif()
-
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)

-set(LIBRARY_TYPE SHARED)
-if (STATIC)
-    set(LIBRARY_TYPE)
+if (BUILD_SHARED_LIBS)
+    set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 endif()

-# auto optimize
-if (AUTO_OPTIMIZE)
-    include(OptimizeForArchitecture)
-    AutodetectHostArchitecture()
-    OptimizeForArchitecture()
+# LTO: probe whether the toolchain supports interprocedural optimization.
+# This only records the result in LTO_SUPPORTED (details in `error`) and logs
+# it; CMAKE_INTERPROCEDURAL_OPTIMIZATION itself is set further down, for
+# Release builds, from LTO_SUPPORTED AND ENABLE_LTO.
+cmake_policy(SET CMP0069 NEW)
+include(CheckIPOSupported)
+check_ipo_supported(RESULT LTO_SUPPORTED OUTPUT error)
+if(LTO_SUPPORTED)
+    message(STATUS "IPO / LTO supported")
+else()
+    message(STATUS "IPO / LTO not supported: <${error}>")
 endif()
+
+CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+if(COMPILER_SUPPORTS_MARCH_NATIVE)
+    set(MARCH_NATIVE_FLAGS "${MARCH_NATIVE_FLAGS} -march=native -mtune=native")
+    set(MARCH_NATIVE_OPT ON)
+endif()
+
+# Flags for SIMD support
+set(HAVE_AVX FALSE)
+set(HAVE_AVX2 FALSE)
+set(HAVE_FMA FALSE)
+set(HAVE_SSE4_1 FALSE)
+set(HAVE_NEON FALSE)
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64.*|AARCH64.*")
+ set(AARCH64 TRUE)
+endif()
+
+if(AARCH64)
+
+add_definitions("-DHAVE_NEON")
+set(HAVE_NEON TRUE)
+
+else()
+
+CHECK_CXX_COMPILER_FLAG("-mavx" HAVE_AVX)
+if(HAVE_AVX)
+    set(AVX_COMPILE_FLAGS "-mavx")
+    add_definitions("-DHAVE_AVX")
+endif(HAVE_AVX)
+
+CHECK_CXX_COMPILER_FLAG("-mavx2" HAVE_AVX2)
+if(HAVE_AVX2)
+    set(AVX2_COMPILE_FLAGS "-mavx2")
+    add_definitions("-DHAVE_AVX2")
+endif()
+
+CHECK_CXX_COMPILER_FLAG("-mfma" HAVE_FMA)
+if(HAVE_FMA)
+    set(FMA_COMPILE_FLAGS "-mfma")
+    add_definitions("-DHAVE_FMA")
+endif()
+
+CHECK_CXX_COMPILER_FLAG("-msse4.1" HAVE_SSE4_1)
+if(HAVE_SSE4_1)
+    set(SSE4_1_COMPILE_FLAGS "-msse4.1")
+    add_definitions("-DHAVE_SSE4_1")
+endif()
+
+if(NOT APPLE)
+    # NEON support relies on getauxval, which is not available on OSX, only on Linux and Android
+    CHECK_CXX_COMPILER_FLAG("-mfpu=neon" HAVE_NEON)
+    if(HAVE_NEON)
+        set(NEON_COMPILE_FLAGS "-mfpu=neon")
+        add_definitions("-DHAVE_NEON")
+    endif()
+endif(NOT APPLE)
+
+if(MSVC)
+    if(NOT HAVE_AVX)
+        set(AVX_COMPILE_FLAGS "/arch:AVX")
+        set(HAVE_AVX ON)
+        add_definitions("-DHAVE_AVX")
+    endif()
+
+    if(NOT HAVE_AVX2)
+        set(AVX2_COMPILE_FLAGS "/arch:AVX2")
+        set(HAVE_AVX2 ON)
+        add_definitions("-DHAVE_AVX2")
+        set(FMA_COMPILE_FLAGS "-D__FMA__")
+        set(HAVE_FMA ON)
+        add_definitions("-DHAVE_FMA")
+    endif()
+
+    if(NOT HAVE_SSE4_1)
+        set(SSE4_1_COMPILE_FLAGS "-D__SSE4_1__")
+        set(HAVE_SSE4_1 ON)
+        add_definitions("-DHAVE_SSE4_1")
+    endif()
+endif(MSVC)
+
+endif(AARCH64)
+
+# Auto optimize - run only to gather information about the available vector
+# units (Vc_ARCHITECTURE_FLAGS is printed in the configuration summary).
+include(OptimizeForArchitecture)
+OptimizeForArchitecture()
+# Remove the per-unit global definitions again so they do not affect the
+# build. Each entry of _enable_vector_unit_list is upper-cased, wrapped in
+# double underscores and has "." replaced by "_" (an entry "sse4.1" would
+# become __SSE4_1__). Presumably these were added by OptimizeForArchitecture().
+foreach(_flag ${_enable_vector_unit_list})
+    string(TOUPPER "${_flag}" _flag)
+    string(REPLACE "." "_" _flag "__${_flag}__")
+    remove_definitions("-D${_flag}")
+endforeach(_flag)
+# Flatten the Vc_ARCHITECTURE_FLAGS list into one space-separated string.
+foreach(flag ${Vc_ARCHITECTURE_FLAGS})
+    set(Vc_CXX_FLAGS "${Vc_CXX_FLAGS} ${flag}")
+endforeach()
+
 # Compiler specific environments
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  set(CLANG 1)
-endif()
 if(CMAKE_COMPILER_IS_GNUCXX OR MINGW)
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og")
 elseif(MSVC)
@ -142,8 +234,14 @@ elseif(MSVC)
    if (NOT CLANG)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
    endif()
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /wd4244 /wd4305 /wd4267")
    # Don't use /Wall because it generates too many warnings.
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /W4 /bigobj")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /W0 /bigobj")
+    # MT flag: when WIN32_MT_BUILD is on, select the static MSVC runtime. The
+    # generator expression yields "MultiThreadedDebug" for Debug configurations
+    # and "MultiThreaded" otherwise. CMAKE_MSVC_RUNTIME_LIBRARY is only
+    # honoured when policy CMP0091 is NEW.
+    if(WIN32_MT_BUILD)
+        set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+        message (STATUS "Building with static CRT.")
+    endif()
 endif()
 if(CLANG)  # clang all platforms
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wno-unused-command-line-argument")
@ -176,7 +274,6 @@ if (OPENMP_BUILD)
    endif()
 endif()

-
 if (CYGWIN)
    add_definitions(-D__CYGWIN__)
 elseif(UNIX)
@ -187,6 +284,8 @@ elseif(WIN32)
    set(LIB_Ws2_32 Ws2_32)
 endif()

+add_definitions("-DCMAKE_BUILD")
+
 ###############################################################################
 #
 # packages
@ -195,10 +294,10 @@ endif()

 if (SW_BUILD)
    find_package(SW REQUIRED)
-    if (STATIC)
-        set(SW_BUILD_SHARED_LIBS 0)
-    else()
+    if (BUILD_SHARED_LIBS)
        set(SW_BUILD_SHARED_LIBS 1)
+    else()
+        set(SW_BUILD_SHARED_LIBS 0)
    endif()
    sw_add_package(
        org.sw.demo.danbloomberg.leptonica
@ -223,7 +322,12 @@ else()
        message(FATAL_ERROR "Cannot find required library Leptonica. Quitting!")
    endif(NOT Leptonica_FOUND)

-    find_package(LibArchive)
+    # Check for optional libarchive.
+    # Prefer pkg-config when PKG_CONFIG_EXECUTABLE is set, otherwise fall back
+    # to CMake's find_package. Neither call is REQUIRED, so configuration
+    # continues without libarchive; the check of LibArchive_FOUND that follows
+    # decides whether HAVE_LIBARCHIVE is enabled.
+    # NOTE(review): pkg_check_modules needs the PkgConfig module to have been
+    # loaded earlier - not visible in this hunk, confirm.
+    if(PKG_CONFIG_EXECUTABLE)
+        pkg_check_modules(LibArchive libarchive)
+    else()
+        find_package(LibArchive)
+    endif()
    if(LibArchive_FOUND)
        set(HAVE_LIBARCHIVE ON)
    endif()
@ -238,32 +342,9 @@ find_package(OpenCL QUIET)
 #
 ###############################################################################

-foreach(flag ${Vc_ARCHITECTURE_FLAGS})
-    set(Vc_CXX_FLAGS "${Vc_CXX_FLAGS} ${flag}")
-endforeach()
-
-# add definition as expected in src/arch/simddetect.cpp
-set(HAVE_AVX OFF)
-set(HAVE_AVX2 OFF)
-set(HAVE_FMA OFF)
-set(HAVE_SSE4_1 OFF)
-set(MARCH_NATIVE_OPT OFF)
-foreach(flag ${_enable_vector_unit_list})  # from OptimizeForArchitecture()
-    string(TOUPPER "${flag}" flag)
-    string(REPLACE "\." "_" flag "${flag}")
-    if("${flag}" MATCHES "AVX|AVX2|FMA|SSE4_1")
-        set(simd_flags "${simd_flags} -DHAVE_${flag}")
-        set("HAVE_${flag}" ON)
-    endif()
-endforeach(flag)
 if (NOT MSVC)
   set(MARCH_NATIVE_FLAGS "${MARCH_NATIVE_FLAGS} -O3 -ffast-math")
 endif()
-CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
-if(COMPILER_SUPPORTS_MARCH_NATIVE)
-    set(MARCH_NATIVE_FLAGS "${MARCH_NATIVE_FLAGS} -march=native -mtune=native")
-    set(MARCH_NATIVE_OPT ON)
-endif()

 set(AUTOCONFIG_SRC ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h.in)
 set(AUTOCONFIG ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h)
@ -277,29 +358,34 @@ include(Configure)

 configure_file(${AUTOCONFIG_SRC} ${AUTOCONFIG} @ONLY)

-set(INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include" "${CMAKE_INSTALL_PREFIX}/include/tesseract")
+set(INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")

 configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/include/tesseract/version.h.in
    ${CMAKE_CURRENT_BINARY_DIR}/include/tesseract/version.h @ONLY)
-configure_file(
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/tesseract.rc.in
-    ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/tesseract.rc @ONLY)
-configure_file(
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/libtesseract.rc.in
-    ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc @ONLY)
-configure_file(
-    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/TesseractConfig-version.cmake.in
-    ${CMAKE_CURRENT_BINARY_DIR}/TesseractConfig-version.cmake @ONLY)
-configure_file(
-    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/TesseractConfig.cmake.in
-    ${CMAKE_CURRENT_BINARY_DIR}/TesseractConfig.cmake @ONLY)
+
+include(CMakePackageConfigHelpers)
+include(GenerateExportHeader)
+configure_package_config_file(
+    cmake/templates/TesseractConfig.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/tesseract/TesseractConfig.cmake
+    INSTALL_DESTINATION lib/cmake/tesseract
+    PATH_VARS INCLUDE_DIR)
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/tesseract/TesseractConfigVersion.cmake
+    VERSION ${PACKAGE_VERSION}
+    COMPATIBILITY SameMajorVersion)

 # show summary of configuration
 if(${CMAKE_BUILD_TYPE} MATCHES Debug)
    set(COMPILER_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
 elseif(${CMAKE_BUILD_TYPE} MATCHES Release)
    set(COMPILER_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
+    if (LTO_SUPPORTED AND ENABLE_LTO)
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    else()
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
+    endif()  # LTO_SUPPORTED
 endif()
 message( STATUS )
 message( STATUS "General configuration for Tesseract ${PACKAGE_VERSION}")
@ -308,6 +394,8 @@ message( STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message( STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}")
 message( STATUS "Used standard: C++${CMAKE_CXX_STANDARD}")
 message( STATUS "CXX compiler options: ${COMPILER_FLAGS}")
+get_directory_property( DirCompDefs COMPILE_DEFINITIONS)
+message( STATUS "Compile definitions = ${DirCompDefs}")
 message( STATUS "Linker options: ${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE_UP}}")
 message( STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}")
 message( STATUS "Architecture flags: ${Vc_ARCHITECTURE_FLAGS}")
@ -317,7 +405,8 @@ message( STATUS "HAVE_AVX2: ${HAVE_AVX2}")
 message( STATUS "HAVE_FMA: ${HAVE_FMA}")
 message( STATUS "HAVE_SSE4_1: ${HAVE_SSE4_1}")
 message( STATUS "MARCH_NATIVE_OPT: ${MARCH_NATIVE_OPT}")
-message( STATUS "simd_flags: ${simd_flags}")
+message( STATUS "HAVE_NEON: ${HAVE_NEON}")
+message( STATUS "Link-time optimization: ${CMAKE_INTERPROCEDURAL_OPTIMIZATION}")
 message( STATUS "--------------------------------------------------------")
 message( STATUS "Build with sw [SW_BUILD]: ${SW_BUILD}")
 message( STATUS "Build with openmp support [OPENMP_BUILD]: ${OPENMP_BUILD}")
@ -345,20 +434,6 @@ include_directories(${LibArchive_INCLUDE_DIRS})

 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
-include_directories(include)
-include_directories(src/arch)
-include_directories(src/ccmain)
-include_directories(src/ccstruct)
-include_directories(src/ccutil)
-include_directories(src/classify)
-include_directories(src/cutil)
-include_directories(src/dict)
-include_directories(src/lstm)
-include_directories(src/opencl)
-include_directories(src/textord)
-include_directories(src/viewer)
-include_directories(src/wordrec)
-include_directories(src/training)
 if(ANDROID_TOOLCHAIN)
  include_directories(${ANDROID_TOOLCHAIN}/sysroot/usr/include)
  add_compile_definitions(__ANDROID_API_FUTURE__)
@ -382,32 +457,125 @@ file(GLOB tesseract_src
    src/wordrec/*.cpp
 )

+if (DISABLED_LEGACY_ENGINE)
+    # prepend_path(<list-var> <path>)
+    # Prefix every entry of the list variable named by `srcs` with "<path>/"
+    # and store the result back into that variable in the caller's scope.
+    function(prepend_path srcs path)
+        # Start from an empty list. The variable name must not carry a comma:
+        # `set(tmp, "")` would define a variable called "tmp," and leave any
+        # `tmp` inherited from the calling scope in place, leaking its entries
+        # into the result.
+        set(tmp "")
+        foreach(src IN LISTS ${srcs})
+            list(APPEND tmp ${path}/${src})
+        endforeach()
+        set(${srcs} ${tmp} PARENT_SCOPE)
+    endfunction()
+
+    SET(tesseract_src_legacy
+        src/ccmain/adaptions.cpp
+        src/ccmain/docqual.cpp
+        src/ccmain/equationdetect.cpp
+        src/ccmain/fixspace.cpp
+        src/ccmain/fixxht.cpp
+        src/ccmain/osdetect.cpp
+        src/ccmain/par_control.cpp
+        src/ccmain/recogtraining.cpp
+        src/ccmain/superscript.cpp
+        src/ccmain/tessbox.cpp
+        src/ccmain/tfacepp.cpp
+        src/ccstruct/fontinfo.cpp
+        src/ccstruct/params_training_featdef.cpp
+        src/ccutil/ambigs.cpp
+        src/ccutil/bitvector.cpp
+        src/ccutil/indexmapbidi.cpp
+        src/ccutil/universalambigs.cpp
+        src/classify/adaptive.cpp
+        src/classify/adaptmatch.cpp
+        src/classify/blobclass.cpp
+        src/classify/cluster.cpp
+        src/classify/clusttool.cpp
+        src/classify/cutoffs.cpp
+        src/classify/featdefs.cpp
+        src/classify/float2int.cpp
+        src/classify/fpoint.cpp
+        src/classify/intfeaturespace.cpp
+        src/classify/intfx.cpp
+        src/classify/intmatcher.cpp
+        src/classify/intproto.cpp
+        src/classify/kdtree.cpp
+        src/classify/mf.cpp
+        src/classify/mfdefs.cpp
+        src/classify/mfoutline.cpp
+        src/classify/mfx.cpp
+        src/classify/normfeat.cpp
+        src/classify/normmatch.cpp
+        src/classify/ocrfeatures.cpp
+        src/classify/outfeat.cpp
+        src/classify/picofeat.cpp
+        src/classify/protos.cpp
+        src/classify/shapeclassifier.cpp
+        src/classify/shapetable.cpp
+        src/classify/tessclassifier.cpp
+        src/classify/trainingsample.cpp
+        src/dict/permdawg.cpp
+        src/dict/hyphen.cpp
+        src/wordrec/associate.cpp
+        src/wordrec/chop.cpp
+        src/wordrec/chopper.cpp
+        src/wordrec/drawfx.cpp
+        src/wordrec/findseam.cpp
+        src/wordrec/gradechop.cpp
+        src/wordrec/language_model.cpp
+        src/wordrec/lm_consistency.cpp
+        src/wordrec/lm_pain_points.cpp
+        src/wordrec/lm_state.cpp
+        src/wordrec/outlines.cpp
+        src/wordrec/params_model.cpp
+        src/wordrec/pieces.cpp
+        src/wordrec/plotedges.cpp
+        src/wordrec/render.cpp
+        src/wordrec/segsearch.cpp
+        src/wordrec/wordclass.cpp
+    )
+    prepend_path(tesseract_src_legacy "${CMAKE_CURRENT_SOURCE_DIR}")
+    list(REMOVE_ITEM tesseract_src ${tesseract_src_legacy})
+endif(DISABLED_LEGACY_ENGINE)
+
 list(APPEND arch_files
    src/arch/dotproduct.cpp
    src/arch/simddetect.cpp
    src/arch/intsimdmatrix.cpp
 )
-set_source_files_properties(${arch_files} PROPERTIES COMPILE_FLAGS "${simd_flags}")
-set_source_files_properties(src/arch/dotproduct.cpp PROPERTIES COMPILE_FLAGS "${MARCH_NATIVE_FLAGS} ${Vc_CXX_FLAGS}")
+
+if(MARCH_NATIVE_FLAGS)
+    set_source_files_properties(src/arch/dotproduct.cpp
+                                PROPERTIES COMPILE_FLAGS ${MARCH_NATIVE_FLAGS})
+endif(MARCH_NATIVE_FLAGS)
 if(HAVE_AVX)
-   list(APPEND arch_files_opt src/arch/dotproductavx.cpp)
-   set_source_files_properties(src/arch/dotproductavx.cpp PROPERTIES COMPILE_FLAGS "-mavx")
+    list(APPEND arch_files_opt src/arch/dotproductavx.cpp)
+    set_source_files_properties(src/arch/dotproductavx.cpp
+                                PROPERTIES COMPILE_FLAGS ${AVX_COMPILE_FLAGS})
 endif(HAVE_AVX)
 if(HAVE_AVX2)
-   list(APPEND arch_files_opt src/arch/intsimdmatrixavx2.cpp)
-   set_source_files_properties(src/arch/intsimdmatrixavx2.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
+    list(APPEND arch_files_opt src/arch/intsimdmatrixavx2.cpp src/arch/dotproductavx.cpp)
+    set_source_files_properties(src/arch/intsimdmatrixavx2.cpp
+                                PROPERTIES COMPILE_FLAGS ${AVX2_COMPILE_FLAGS})
 endif(HAVE_AVX2)
 if(HAVE_FMA)
-   list(APPEND arch_files_opt src/arch/dotproductfma.cpp)
-   set_source_files_properties(src/arch/dotproductfma.cpp PROPERTIES COMPILE_FLAGS "-mfma")
+    list(APPEND arch_files_opt src/arch/dotproductfma.cpp)
+    set_source_files_properties(src/arch/dotproductfma.cpp
+                                PROPERTIES COMPILE_FLAGS ${FMA_COMPILE_FLAGS})
 endif(HAVE_FMA)
 if(HAVE_SSE4_1)
-   list(APPEND arch_files_opt src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp)
-   set_source_files_properties(src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp PROPERTIES COMPILE_FLAGS "-msse4.1")
+    list(APPEND arch_files_opt src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp)
+    set_source_files_properties(src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
+                                PROPERTIES COMPILE_FLAGS ${SSE4_1_COMPILE_FLAGS})
 endif(HAVE_SSE4_1)
-set_source_files_properties(${arch_files_opt} PROPERTIES COMPILE_FLAGS "${Vc_CXX_FLAGS}")
+if(HAVE_NEON)
+   list(APPEND arch_files_opt src/arch/intsimdmatrixneon.cpp)
+   if(NEON_COMPILE_FLAGS)
+       set_source_files_properties(src/arch/intsimdmatrixneon.cpp
+                                   PROPERTIES COMPILE_FLAGS ${NEON_COMPILE_FLAGS})
+   endif()
+endif(HAVE_NEON)

-file(GLOB tesseract_hdr
+file(GLOB_RECURSE tesseract_hdr
+    include/*
    src/arch/*.h
    src/ccmain/*.h
    src/ccstruct/*.h
@ -433,25 +601,35 @@ set(tesseract_src ${tesseract_src}
    src/api/wordstrboxrenderer.cpp
 )

-if (WIN32)
-    if (MSVC)
-        include_directories(src/vs2010/tesseract)
-        set(tesseract_hdr
-            ${tesseract_hdr}
-            ${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/resource.h)
-        set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc)
-    endif()  # MSVC
-endif()
+set(libtessfiles ${tesseract_src} ${arch_files} ${arch_files_opt} ${tesseract_hdr})

-add_library                     (libtesseract ${LIBRARY_TYPE} ${tesseract_src} ${arch_files}
-    ${arch_files_opt} ${tesseract_hdr} ${tesseract_rsc}
-    )
-if (NOT STATIC)
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${libtessfiles})
+
+add_library                     (libtesseract ${libtessfiles})
+target_include_directories      (libtesseract
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    PRIVATE src
+
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/arch>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccmain>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccstruct>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/ccutil>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/classify>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/cutil>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/dict>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/lstm>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/opencl>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/textord>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/viewer>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/wordrec>
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/training>
+)
+if (BUILD_SHARED_LIBS)
 target_compile_definitions      (libtesseract
    PRIVATE -DTESS_EXPORTS
    INTERFACE -DTESS_IMPORTS
 )
-set_target_properties           (libtesseract PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS True)
+#generate_export_header          (libtesseract EXPORT_MACRO_NAME TESS_API)
 endif()
 target_link_libraries           (libtesseract PRIVATE ${LIB_Ws2_32} ${LIB_pthread})
 if(OpenMP_CXX_FOUND)
@ -472,13 +650,13 @@ if (SW_BUILD)
        org.sw.demo.libarchive.libarchive
    )
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake "include(${CMAKE_CURRENT_BINARY_DIR}/cppan.cmake)\n")
-    export(TARGETS libtesseract APPEND FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake)
+    export(TARGETS libtesseract APPEND FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake NAMESPACE Tesseract::)
 else()
    target_link_libraries       (libtesseract PUBLIC
        ${Leptonica_LIBRARIES}
        ${LibArchive_LIBRARIES}
    )
-    export(TARGETS libtesseract FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake)
+    export(TARGETS libtesseract FILE ${CMAKE_CURRENT_BINARY_DIR}/TesseractTargets.cmake NAMESPACE Tesseract::)
 endif()

 if (WIN32 AND CLANG AND OPENMP_BUILD)
@ -491,12 +669,7 @@ endif()
 # EXECUTABLE tesseractmain
 ########################################

-set(tesseractmain_src src/api/tesseractmain.cpp)
-if (MSVC)
-    set(tesseractmain_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/tesseract.rc)
-endif()
-
-add_executable                  (tesseract ${tesseractmain_src} ${tesseractmain_rsc})
+add_executable                  (tesseract src/api/tesseractmain.cpp)
 target_link_libraries           (tesseract libtesseract)
 if (HAVE_TIFFIO_H)
    target_link_libraries(tesseract tiff)
@ -523,37 +696,25 @@ configure_file(tesseract.pc.cmake ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc @ONLY
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc DESTINATION lib/pkgconfig)
 install(TARGETS tesseract RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib)
 install(TARGETS libtesseract EXPORT TesseractTargets RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib)
-install(EXPORT TesseractTargets DESTINATION cmake)
-install(FILES
-    ${CMAKE_CURRENT_BINARY_DIR}/TesseractConfig.cmake
-    ${CMAKE_CURRENT_BINARY_DIR}/TesseractConfig-version.cmake
-    DESTINATION cmake)
+install(EXPORT TesseractTargets NAMESPACE Tesseract:: DESTINATION lib/cmake/tesseract)
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cmake DESTINATION lib)

 install(FILES
-    # from api/makefile.am
-    include/tesseract/apitypes.h
    include/tesseract/baseapi.h
    include/tesseract/capi.h
    include/tesseract/renderer.h
    ${CMAKE_CURRENT_BINARY_DIR}/include/tesseract/version.h

-    #from ccmain/makefile.am
    include/tesseract/thresholder.h
    include/tesseract/ltrresultiterator.h
    include/tesseract/pageiterator.h
    include/tesseract/resultiterator.h
    include/tesseract/osdetect.h

-    #from ccstruct/makefile.am
    include/tesseract/publictypes.h

-    #from ccutil/makefile.am
-    include/tesseract/genericvector.h
-    include/tesseract/helpers.h
    include/tesseract/ocrclass.h
-    include/tesseract/platform.h
-    include/tesseract/serialis.h
-    include/tesseract/strngs.h
+    include/tesseract/export.h
    include/tesseract/unichar.h

    #${CMAKE_CURRENT_BINARY_DIR}/src/endianness.h
--- a/2
+++ b/2
@ -7,7 +7,7 @@ RUN apt-get update
 RUN DEBIAN_FRONTEND=noninteractive apt-get install -y cmake curl git ruby bundler wget unzip \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
-RUN gem install bundler travis --no-ri --no-rdoc
+RUN gem install bundler travis --no-document
 RUN git clone --depth 1 https://github.com/travis-ci/travis-build ~/.travis/travis-build
 RUN bundle install --gemfile ~/.travis/travis-build/Gemfile

--- a/INSTALL.GIT.md
+++ b/INSTALL.GIT.md
@ -40,8 +40,8 @@ all languages).

 You need an Internet connection and [curl](https://curl.haxx.se/) to compile `ScrollView.jar`
 because the build will automatically download
-[piccolo2d-core-3.0.jar](http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar) and
-[piccolo2d-extras-3.0.jar](http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar) and
+[piccolo2d-core-3.0.1.jar](https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar) and
+[piccolo2d-extras-3.0.1.jar](https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar) and
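-[jaxb-api-2.3.1.jar](http://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar) and place them to `tesseract/java`.
+[jaxb-api-2.3.1.jar](https://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar) and place them in `tesseract/java`.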

 Just run:
--- a/Makefile.am
+++ b/Makefile.am
--- a/README.md
+++ b/README.md
@ -2,11 +2,11 @@

 [![Build Status](https://travis-ci.org/tesseract-ocr/tesseract.svg?branch=master)](https://travis-ci.org/tesseract-ocr/tesseract)
 [![Build status](https://ci.appveyor.com/api/projects/status/miah0ikfsf0j3819/branch/master?svg=true)](https://ci.appveyor.com/project/zdenop/tesseract/)
-![Build status](https://github.com/tesseract-ocr/tesseract/workflows/windows/badge.svg)<br>
+![Build status](https://github.com/tesseract-ocr/tesseract/workflows/sw/badge.svg)<br>
 [![Coverity Scan Build Status](https://scan.coverity.com/projects/tesseract-ocr/badge.svg)](https://scan.coverity.com/projects/tesseract-ocr)
 [![Code Quality: Cpp](https://img.shields.io/lgtm/grade/cpp/g/tesseract-ocr/tesseract.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/tesseract-ocr/tesseract/context:cpp)
 [![Total Alerts](https://img.shields.io/lgtm/alerts/g/tesseract-ocr/tesseract.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/tesseract-ocr/tesseract/alerts)
-[![OSS-Fuzz](https://img.shields.io/badge/oss--fuzz-fuzzing-brightgreen)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tesseract-ocr)
+[![OSS-Fuzz](https://img.shields.io/badge/oss--fuzz-fuzzing-brightgreen)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=2&q=proj:tesseract-ocr)
 <br/>
 [![GitHub license](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](https://raw.githubusercontent.com/tesseract-ocr/tesseract/master/LICENSE)
 [![Downloads](https://img.shields.io/badge/download-all%20releases-brightgreen.svg)](https://github.com/tesseract-ocr/tesseract/releases/)
@ -18,7 +18,7 @@ Tesseract 4 adds a new neural net (LSTM) based OCR engine which is focused
 on line recognition, but also still supports the legacy Tesseract OCR engine of
 Tesseract 3 which works by recognizing character patterns. Compatibility with
 Tesseract 3 is enabled by using the Legacy OCR Engine mode (--oem 0).
-It also needs traineddata files which support the legacy engine, for example
+It also needs [traineddata](https://tesseract-ocr.github.io/tessdoc/Data-Files.html) files which support the legacy engine, for example
 those from the tessdata repository.

 The lead developer is Ray Smith. The maintainer is Zdenko Podobny.
@ -61,13 +61,7 @@ and **[Change Log](https://github.com/tesseract-ocr/tesseract/blob/master/Change
 You can either [Install Tesseract via pre-built binary package](https://tesseract-ocr.github.io/tessdoc/Home.html)
 or [build it from source](https://tesseract-ocr.github.io/tessdoc/Compiling.html).

-Supported Compilers are:
-
-* GCC 4.8 and above
-* Clang 3.4 and above
-* MSVC 2015, 2017, 2019
-
-Other compilers might work, but are not officially supported.
+C++17 support is required for building.

 ## Running Tesseract

--- a/2
+++ b/2
@ -1 +1 @@
-5.0.0-alpha
+5.0.0-alpha-20201231
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit daf381e8535a1f1f1b8a75966a74e7cca63dee89
+Subproject commit b832dce8489ef7b6231384909fd9b68d5a5ff2b7
--- a/appveyor.yml
+++ b/appveyor.yml
@ -3,12 +3,12 @@ environment:
    # does not work with sw at the moment
    #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
      #platform: Win32
-    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      platform: Win32
-    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      platform: Win64
-    #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
+    #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
+      #platform: Win32
+    #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
      #platform: Win64
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
+      platform: Win64

 configuration:
  - Release
@ -17,18 +17,30 @@ cache:
  - c:/Users/appveyor/.sw -> appveyor.yml

 before_build:
+  - git submodule update --init --recursive
  - curl -fsS -L -o dl.zip https://github.com/SoftwareNetwork/binaries/raw/master/sw-master-windows-client.zip
  - 7z x dl.zip
  - set PATH=%PATH%;%cd%

 build_script:
  - sw -version
-  - sw -show-output -platform %platform% build
+  # -show-output - show command output
+  # debug build causes long builds (> 1h), appveyor drops them
+  - sw -platform %platform% -config r build -Dwith-tests=1
+  # test
+  - git clone https://github.com/egorpugin/tessdata tessdata_unittest
+  - ps: Copy-Item -Path "tessdata_unittest\fonts\*" -Destination "test\testing" -Recurse
+  - sw -platform %platform% -config r test -Dwith-tests=1 -Dskip-tests=lstm,lstm_recode

 after_build:
  - 7z a tesseract.zip %APPVEYOR_BUILD_FOLDER%\.sw\out\**\*.exe %APPVEYOR_BUILD_FOLDER%\.sw\out\**\*.dll
  #- 7z a tesseract.zip %APPVEYOR_BUILD_FOLDER%\.sw\Windows_*_Shared_Release_MSVC_*\*.exe %APPVEYOR_BUILD_FOLDER%\.sw\Windows_*_Shared_Release_MSVC_*\*.dll

+on_finish:
+  # gather tests
+  - ps: $wc = New-Object 'System.Net.WebClient'
+  - ps: $wc.UploadFile("https://ci.appveyor.com/api/testresults/junit/$($env:APPVEYOR_JOB_ID)", (Resolve-Path .\.sw\test\results.xml))
+
 artifacts:
  - path: tesseract.zip
    name: tesseract-$(APPVEYOR_BUILD_VERSION)
--- a/cmake/AddCompilerFlag.cmake
+++ b/cmake/AddCompilerFlag.cmake
@ -35,8 +35,8 @@
 #=============================================================================

 get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH)
-include("${_currentDir}/CheckCCompilerFlag.cmake")
-include("${_currentDir}/CheckCXXCompilerFlag.cmake")
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)

 macro(AddCompilerFlag _flag)
   string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}")
--- a/cmake/CheckCCompilerFlag.cmake
+++ b/cmake/CheckCCompilerFlag.cmake
@ -1,73 +0,0 @@
-# - Check whether the C compiler supports a given flag.
-# CHECK_C_COMPILER_FLAG(<flag> <var>)
-#  <flag> - the compiler flag
-#  <var>  - variable to store the result
-# This internally calls the check_c_source_compiles macro.
-# See help for CheckCSourceCompiles for a listing of variables
-# that can modify the build.
-
-#=============================================================================
-# Copyright 2006-2009 Kitware, Inc.
-# Copyright 2006 Alexander Neundorf <neundorf@kde.org>
-# Copyright 2011-2013 Matthias Kretz <kretz@kde.org>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#  * Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-#  * The names of Kitware, Inc., the Insight Consortium, or the names of
-#    any consortium members, or of any contributors, may not be used to
-#    endorse or promote products derived from this software without
-#    specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#=============================================================================
-
-INCLUDE(CheckCSourceCompiles)
-
-MACRO (CHECK_C_COMPILER_FLAG _FLAG _RESULT)
-   SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
-   SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}")
-   if(${ARGC} GREATER 2)
-      SET(TEST_SOURCE "${ARGV2}")
-   else()
-      SET(TEST_SOURCE "int main() { return 0;}")
-   endif()
-   CHECK_C_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT}
-     # Some compilers do not fail with a bad flag
-     FAIL_REGEX "error: bad value (.*) for .* switch"       # GNU
-     FAIL_REGEX "argument unused during compilation"        # clang
-     FAIL_REGEX "is valid for .* but not for C"             # GNU
-     FAIL_REGEX "unrecognized .*option"                     # GNU
-     FAIL_REGEX "ignored for target"                        # GNU
-     FAIL_REGEX "ignoring unknown option"                   # MSVC
-     FAIL_REGEX "warning D9002"                             # MSVC
-     FAIL_REGEX "[Uu]nknown option"                         # HP
-     FAIL_REGEX "[Ww]arning: [Oo]ption"                     # SunPro
-     FAIL_REGEX "command option .* is not recognized"       # XL
-     FAIL_REGEX "WARNING: unknown flag:"                    # Open64
-     FAIL_REGEX "command line error"                        # ICC
-     FAIL_REGEX "command line warning"                      # ICC
-     FAIL_REGEX "#10236:"                                   # ICC: File not found
-     FAIL_REGEX " #10159: "                                 # ICC
-     FAIL_REGEX " #10353: "                                 # ICC: option '-mfma' ignored, suggest using '-march=core-avx2'
-     )
-   SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}")
-ENDMACRO (CHECK_C_COMPILER_FLAG)
-
--- a/cmake/CheckCXXCompilerFlag.cmake
+++ b/cmake/CheckCXXCompilerFlag.cmake
@ -1,73 +0,0 @@
-# - Check whether the CXX compiler supports a given flag.
-# CHECK_CXX_COMPILER_FLAG(<flag> <var>)
-#  <flag> - the compiler flag
-#  <var>  - variable to store the result
-# This internally calls the check_cxx_source_compiles macro.  See help
-# for CheckCXXSourceCompiles for a listing of variables that can
-# modify the build.
-
-#=============================================================================
-# Copyright 2006-2009 Kitware, Inc.
-# Copyright 2006 Alexander Neundorf <neundorf@kde.org>
-# Copyright 2011-2013 Matthias Kretz <kretz@kde.org>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#  * Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-#  * The names of Kitware, Inc., the Insight Consortium, or the names of
-#    any consortium members, or of any contributors, may not be used to
-#    endorse or promote products derived from this software without
-#    specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#=============================================================================
-
-INCLUDE(CheckCXXSourceCompiles)
-
-MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT)
-   SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
-   SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}")
-   if(${ARGC} GREATER 2)
-      SET(TEST_SOURCE "${ARGV2}")
-   else()
-      SET(TEST_SOURCE "int main() { return 0;}")
-   endif()
-   CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT}
-     # Some compilers do not fail with a bad flag
-     FAIL_REGEX "error: bad value (.*) for .* switch"       # GNU
-     FAIL_REGEX "argument unused during compilation"        # clang
-     FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+"   # GNU
-     FAIL_REGEX "unrecognized .*option"                     # GNU
-     FAIL_REGEX "ignored for target"                        # GNU
-     FAIL_REGEX "ignoring unknown option"                   # MSVC
-     FAIL_REGEX "warning D9002"                             # MSVC
-     FAIL_REGEX "[Uu]nknown option"                         # HP
-     FAIL_REGEX "[Ww]arning: [Oo]ption"                     # SunPro
-     FAIL_REGEX "command option .* is not recognized"       # XL
-     FAIL_REGEX "WARNING: unknown flag:"                    # Open64
-     FAIL_REGEX "command line error"                        # ICC
-     FAIL_REGEX "command line warning"                      # ICC
-     FAIL_REGEX "#10236:"                                   # ICC: File not found
-     FAIL_REGEX " #10159: "                                 # ICC
-     FAIL_REGEX " #10353: "                                 # ICC: option '-mfma' ignored, suggest using '-march=core-avx2'
-     )
-   SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}")
-ENDMACRO (CHECK_CXX_COMPILER_FLAG)
-
--- a/cmake/Configure.cmake
+++ b/cmake/Configure.cmake
@ -82,18 +82,12 @@ include(TestBigEndian)
 set(include_files_list
    dlfcn.h
    inttypes.h
-    limits.h
-    malloc.h
    memory.h
-    stdbool.h
    stdint.h
    stdlib.h
    string.h
-    sys/ipc.h
-    sys/shm.h
    sys/stat.h
    sys/types.h
-    sys/wait.h
    unistd.h

    cairo/cairo-version.h
@ -107,10 +101,7 @@ check_includes(include_files_list)

 set(types_list
    "long long int"
-    off_t
-    mbstate_t
    wchar_t
-    _Bool
 )
 check_types(types_list)

@ -129,14 +120,6 @@ if(TESSDATA_PREFIX)
 ")
 endif()

-test_big_endian(WORDS_BIGENDIAN)
-
-file(APPEND ${AUTOCONFIG_SRC} "
-/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
-   significant byte first (like Motorola and SPARC, unlike Intel). */
-#cmakedefine WORDS_BIGENDIAN 1
-")
-
 ########################################

 ################################################################################
--- a/cmake/templates/TesseractConfig-version.cmake.in
+++ b/cmake/templates/TesseractConfig-version.cmake.in
@ -1,14 +0,0 @@
-set(Tesseract_VERSION @VERSION_PLAIN@)
-set(PACKAGE_VERSION ${Tesseract_VERSION})
-
-set(PACKAGE_VERSION_EXACT False)
-set(PACKAGE_VERSION_COMPATIBLE False)
-
-if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
-  set(PACKAGE_VERSION_EXACT True)
-  set(PACKAGE_VERSION_COMPATIBLE True)
-endif()
-
-if(PACKAGE_FIND_VERSION VERSION_LESS PACKAGE_VERSION)
-  set(PACKAGE_VERSION_COMPATIBLE True)
-endif()
--- a/cmake/templates/TesseractConfig.cmake.in
+++ b/cmake/templates/TesseractConfig.cmake.in
@ -7,39 +7,22 @@
 #    In your CMakeLists.txt, add these lines:
 #
 #    find_package(Tesseract REQUIRED)
-#    include_directories(${Tesseract_INCLUDE_DIRS})
-#    target_link_libraries(MY_TARGET_NAME ${Tesseract_LIBRARIES})
+#    target_link_libraries(MY_TARGET_NAME Tesseract::libtesseract)
 #
 #    This file will define the following variables:
-#      - Tesseract_LIBRARIES             : The list of all imported targets for OpenCV modules.
+#      - Tesseract_LIBRARIES             : The list of all imported targets for Tesseract.
 #      - Tesseract_INCLUDE_DIRS          : The Tesseract include directories.
-#      - Tesseract_VERSION               : The version of this Tesseract build: "@VERSION_PLAIN@"
-#      - Tesseract_VERSION_MAJOR         : Major version part of Tesseract_VERSION: "@VERSION_MAJOR@"
-#      - Tesseract_VERSION_MINOR         : Minor version part of Tesseract_VERSION: "@VERSION_MINOR@"
 #
 # ===================================================================================

+include(CMakeFindDependencyMacro)
+find_dependency(Leptonica)
+
 include(${CMAKE_CURRENT_LIST_DIR}/TesseractTargets.cmake)

-find_package(Leptonica REQUIRED)
-
-# ======================================================
-#  Version variables:
-# ======================================================
-
-SET(Tesseract_VERSION           @VERSION_PLAIN@)
-SET(Tesseract_VERSION_MAJOR     @VERSION_MAJOR@)
-SET(Tesseract_VERSION_MINOR     @VERSION_MINOR@)
-
-# ======================================================
-# Include directories to add to the user project:
-# ======================================================
-
-# Provide the include directories to the caller
-set(Tesseract_INCLUDE_DIRS @INCLUDE_DIR@)
-
-# ====================================================================
-# Link libraries:
-# ====================================================================
+@PACKAGE_INIT@

+set_and_check(Tesseract_INCLUDE_DIRS "@PACKAGE_INCLUDE_DIR@")
 set(Tesseract_LIBRARIES libtesseract)
+
+check_required_components(Tesseract)
--- a/configure.ac
+++ b/configure.ac
@ -28,8 +28,8 @@ AM_INIT_AUTOMAKE([foreign subdir-objects])

 # Define date of package, etc. Could be useful in auto-generated
 # documentation.
-PACKAGE_YEAR=2018
-PACKAGE_DATE="10/29"
+PACKAGE_YEAR=2020
+PACKAGE_DATE="12/31"

 abs_top_srcdir=`AS_DIRNAME([$0])`

@ -72,7 +72,6 @@ AC_CONFIG_HEADERS([config_auto.h:config/config.h.in])

 # default conditional
 AM_CONDITIONAL([T_WIN], false)
-AM_CONDITIONAL([OSX], false)
 AM_CONDITIONAL([GRAPHICS_DISABLED], false)
 AC_SUBST([AM_CPPFLAGS])

@ -124,29 +123,66 @@ AX_CHECK_COMPILE_FLAG([-Werror=unused-command-line-argument], [WERROR=-Werror=un

 ## Checks for supported compiler options.

-AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false], [$WERROR])
-AM_CONDITIONAL([HAVE_AVX], ${avx})
-if $avx; then
-  AC_DEFINE([HAVE_AVX], [1], [Enable AVX instructions])
-fi
+AM_CONDITIONAL([HAVE_AVX], false)
+AM_CONDITIONAL([HAVE_AVX2], false)
+AM_CONDITIONAL([HAVE_FMA], false)
+AM_CONDITIONAL([HAVE_SSE4_1], false)
+AM_CONDITIONAL([HAVE_NEON], false)

-AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])
-AM_CONDITIONAL([HAVE_AVX2], $avx2)
-if $avx2; then
-  AC_DEFINE([HAVE_AVX2], [1], [Enable AVX2 instructions])
-fi
+case "${host_cpu}" in

-AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
-AM_CONDITIONAL([HAVE_FMA], $fma)
-if $fma; then
-  AC_DEFINE([HAVE_FMA], [1], [Enable FMA instructions])
-fi
+  *86*)

-AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])
-AM_CONDITIONAL([HAVE_SSE4_1], $sse41)
-if $sse41; then
-  AC_DEFINE([HAVE_SSE4_1], [1], [Enable SSE 4.1 instructions])
-fi
+    AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_AVX], ${avx})
+    if $avx; then
+      AC_DEFINE([HAVE_AVX], [1], [Enable AVX instructions])
+    fi
+
+    AX_CHECK_COMPILE_FLAG([-mavx2], [avx2=true], [avx2=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_AVX2], $avx2)
+    if $avx2; then
+      AC_DEFINE([HAVE_AVX2], [1], [Enable AVX2 instructions])
+    fi
+
+    AX_CHECK_COMPILE_FLAG([-mfma], [fma=true], [fma=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_FMA], $fma)
+    if $fma; then
+      AC_DEFINE([HAVE_FMA], [1], [Enable FMA instructions])
+    fi
+
+    AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_SSE4_1], $sse41)
+    if $sse41; then
+      AC_DEFINE([HAVE_SSE4_1], [1], [Enable SSE 4.1 instructions])
+    fi
+
+    ;;
+
+  aarch64)
+
+    # ARMv8 always has NEON and does not need special compiler flags.
+    AM_CONDITIONAL([HAVE_NEON], true)
+    AC_DEFINE([HAVE_NEON], [1], [Enable NEON instructions])
+    ;;
+
+  arm*)
+
+    AX_CHECK_COMPILE_FLAG([-mfpu=neon], [neon=true], [neon=false], [$WERROR])
+    AM_CONDITIONAL([HAVE_NEON], $neon)
+    if $neon; then
+      AC_DEFINE([HAVE_NEON], [1], [Enable NEON instructions])
+      NEON_CXXFLAGS="-mfpu=neon"
+      AC_SUBST([NEON_CXXFLAGS])
+    fi
+
+    ;;
+
+  *)
+
+    AC_MSG_WARN([No compiler options for $host_cpu])
+
+esac

 AX_CHECK_COMPILE_FLAG([-march=native], [arch_native=true], [arch_native=false], [$WERROR])
 AM_CONDITIONAL([MARCH_NATIVE_OPT], $arch_native)
@ -207,11 +243,21 @@ if test "$enable_opencl" = "yes"; then
  ])
 fi

-# Check whether to build with support for TensorFlow.
+# Configure arguments which allow disabling some optional libraries.
+AC_ARG_WITH([archive],
+            AS_HELP_STRING([--with-archive],
+                           [Build with libarchive which supports compressed model files @<:@default=check@:>@]),
+            [], [with_archive=check])
+AC_ARG_WITH([curl],
+            AS_HELP_STRING([--with-curl],
+                           [Build with libcurl which supports processing an image URL @<:@default=check@:>@]),
+            [], [with_curl=check])
 AC_ARG_WITH([tensorflow],
  AS_HELP_STRING([--with-tensorflow],
                 [support TensorFlow @<:@default=check@:>@]),
  [], [with_tensorflow=check])
+
+# Check whether to build with support for TensorFlow.
 AM_CONDITIONAL([TENSORFLOW], false)
 TENSORFLOW_LIBS=
 AS_IF([test "x$with_tensorflow" != xno],
@ -280,7 +326,7 @@ case "${host_os}" in
    fi
    ;;
 esac
-AM_CONDITIONAL([USE_OPENCL], [test "$enable_opencl" = "yes"])
+AM_CONDITIONAL([OPENCL], [test "$enable_opencl" = "yes"])
 AC_SUBST([OPENCL_CPPFLAGS])
 AC_SUBST([OPENCL_LDFLAGS])

@ -338,23 +384,6 @@ else
    AM_CPPFLAGS="$AM_CPPFLAGS -O2 -DNDEBUG"
 fi

-# Always look into a "gnu" directory.
-curwd=`pwd`
-if test -d $curwd/gnu/include ; then
-   CPPFLAGS="$CPPFLAGS -I$curwd/gnu/include"
-fi
-if test -d $curwd/gnu/lib ; then
-   LDFLAGS="$LDFLAGS -L$curwd/gnu/lib"
-fi
-
-# ----------------------------------------
-# Additional checking of compiler characteristics
-# ----------------------------------------
-
-# Check Endianness. If Big Endian, this will define WORDS_BIGENDIAN
-AC_C_BIGENDIAN
-
-
 # ----------------------------------------
 # Init libtool
 # ----------------------------------------
@ -366,20 +395,18 @@ LT_INIT
 # C++ related options
 # ----------------------------------------
 dnl **********************
-dnl Turn on C++11 or newer
+dnl Turn on C++17 or newer
 dnl **********************

 CPLUSPLUS=
-AX_CHECK_COMPILE_FLAG([-std=c++11], [CPLUSPLUS=11], [], [$WERROR])
-AX_CHECK_COMPILE_FLAG([-std=c++14], [CPLUSPLUS=14], [], [$WERROR])
 AX_CHECK_COMPILE_FLAG([-std=c++17], [CPLUSPLUS=17], [], [$WERROR])
-#AX_CHECK_COMPILE_FLAG([-std=c++20], [CPLUSPLUS=20], [], [$WERROR])
+AX_CHECK_COMPILE_FLAG([-std=c++20], [CPLUSPLUS=20], [], [$WERROR])

 if test -z "$CPLUSPLUS"; then
-  AC_MSG_ERROR([Your compiler does not have the necessary C++11 support! Cannot proceed.])
+  AC_MSG_ERROR([Your compiler does not have the necessary C++17 support! Cannot proceed.])
 fi

-# Set C++11, C++14 or C++17 support based on platform/compiler
+# Set C++17 or newer support based on platform/compiler
 case "${host_os}" in
  cygwin*)
    CXXFLAGS="$CXXFLAGS -std=gnu++$CPLUSPLUS"
@ -404,29 +431,43 @@ esac
 AC_SEARCH_LIBS([pthread_create], [pthread])


-# ----------------------------------------
-# Checks for header files.
-# ----------------------------------------
-
-AC_HEADER_STDC
-AC_HEADER_TIME
-AC_HEADER_SYS_WAIT
-AC_CHECK_HEADERS([sys/ipc.h sys/shm.h])
-AC_CHECK_HEADERS([limits.h malloc.h])
-# Enable use of system-defined bool type if available:
-AC_HEADER_STDBOOL
-
 # ----------------------------------------
 # Check for programs needed to build documentation.
 # ----------------------------------------

-AC_CHECK_PROG([have_asciidoc], asciidoc, true, false)
-AC_CHECK_PROG([have_xsltproc], xsltproc, true, false)
-if $have_asciidoc && $have_xsltproc; then
-  AM_CONDITIONAL([ASCIIDOC], true)
-else
-  AM_CONDITIONAL([ASCIIDOC], false)
-fi
+AM_CONDITIONAL([ASCIIDOC], false)
+AM_CONDITIONAL([HAVE_XML_CATALOG_FILES], false)
+AC_ARG_ENABLE([doc],
+              AS_HELP_STRING([--disable-doc], [disable build of documentation]),
+              [],
+              [: m4_divert_text([DEFAULTS], [enable_doc=check])])
+AS_IF([test "$enable_doc" != "no"], [
+  AC_CHECK_PROG([have_asciidoc], asciidoc, true, false)
+  AC_CHECK_PROG([have_xsltproc], xsltproc, true, false)
+  # MacOS with Homebrew requires the environment variable
+  # XML_CATALOG_FILES for xsltproc.
+  if $have_asciidoc && $have_xsltproc; then
+    AM_CONDITIONAL([ASCIIDOC], true)
+    XML_CATALOG_FILES=
+    AC_CHECK_PROG([have_brew], brew, true, false)
+    if $have_brew; then
+      brew_prefix=$(brew --prefix)
+      catalog_file=$brew_prefix/etc/xml/catalog
+      if test -f $catalog_file; then
+        AM_CONDITIONAL([HAVE_XML_CATALOG_FILES], true)
+        XML_CATALOG_FILES=file:$catalog_file
+      else
+        AC_MSG_WARN([Missing file $catalog_file.])
+      fi
+    fi
+    AC_SUBST([XML_CATALOG_FILES])
+  else
+    AS_IF([test "x$enable_doc" != xcheck], [
+      AC_MSG_FAILURE(
+        [--enable-doc was given, but test for asciidoc and xsltproc failed])
+    ])
+  fi
+])

 # ----------------------------------------
 # Checks for typedefs, structures, and compiler characteristics.
@ -434,18 +475,24 @@ fi

 AC_CHECK_TYPES([wchar_t],,, [#include "wchar.h"])
 AC_CHECK_TYPES([long long int])
-AC_CHECK_TYPES([off_t],,, [#include "sys/types.h"])
-AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"])

 # ----------------------------------------
 # Test auxiliary packages
 # ----------------------------------------

-PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])
-AM_CONDITIONAL([HAVE_LIBCURL], $have_libcurl)
-if $have_libcurl; then
-  AC_DEFINE([HAVE_LIBCURL], [1], [Enable libcurl])
-fi
+AM_CONDITIONAL([HAVE_LIBCURL], false)
+AS_IF([test "x$with_curl" != xno], [
+  PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])
+  AM_CONDITIONAL([HAVE_LIBCURL], $have_libcurl)
+  if $have_libcurl; then
+    AC_DEFINE([HAVE_LIBCURL], [1], [Enable libcurl])
+  else
+    AS_IF([test "x$with_curl" != xcheck], [
+      AC_MSG_FAILURE(
+        [--with-curl was given, but test for libcurl failed])
+    ])
+  fi
+])

 PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
 if $have_lept; then
@ -454,12 +501,20 @@ else
  AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
 fi

-PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
-AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
-if $have_libarchive; then
-  AC_DEFINE([HAVE_LIBARCHIVE], [1], [Enable libarchive])
-  CPPFLAGS="$CPPFLAGS $libarchive_CFLAGS"
-fi
+AM_CONDITIONAL([HAVE_LIBARCHIVE], false)
+AS_IF([test "x$with_archive" != xno], [
+  PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
+  AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
+  if $have_libarchive; then
+    AC_DEFINE([HAVE_LIBARCHIVE], [1], [Enable libarchive])
+    CPPFLAGS="$CPPFLAGS $libarchive_CFLAGS"
+  else
+    AS_IF([test "x$with_archive" != xcheck], [
+      AC_MSG_FAILURE(
+        [--with-archive was given, but test for libarchive failed])
+    ])
+  fi
+])

 AM_CONDITIONAL([ENABLE_TRAINING], true)

@ -503,15 +558,12 @@ AC_CONFIG_FILES([Makefile tesseract.pc])
 AC_CONFIG_FILES([tessdata/Makefile])
 AC_CONFIG_FILES([tessdata/configs/Makefile])
 AC_CONFIG_FILES([tessdata/tessconfigs/Makefile])
-AC_CONFIG_FILES([unittest/Makefile])
 AC_CONFIG_FILES([java/Makefile])
 AC_CONFIG_FILES([java/com/Makefile])
 AC_CONFIG_FILES([java/com/google/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
-AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
 AC_OUTPUT

 # Final message
@ -524,13 +576,15 @@ echo "$ sudo make install"
 echo "$ sudo ldconfig"
 echo ""

-AM_COND_IF([ASCIIDOC],
-  [
-   echo "This will also build the documentation."
+AM_COND_IF([ASCIIDOC], [
+  echo "This will also build the documentation."
+], [
+  AS_IF([test "$enable_doc" = "no"], [
+    echo "Documentation will not be built because it was disabled."
  ], [
-   echo "Documentation will not be built because asciidoc or xsltproc is missing."
-  ]
-)
+    echo "Documentation will not be built because asciidoc or xsltproc is missing."
+  ])
+])

 # echo "$ sudo make install LANGS=\"eng ara deu\""
 # echo "  Or:"
--- a/doc/Doxyfile
+++ b/doc/Doxyfile
@ -899,7 +899,7 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.

-EXCLUDE                = ../src/vs2010
+EXCLUDE                =

 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@ -1,53 +0,0 @@
-# doc/Makefile.am
-
-if ASCIIDOC
-
-man_MANS = \
-  combine_lang_model.1 \
-  combine_tessdata.1  \
-  dawg2wordlist.1 \
-  lstmeval.1 \
-  lstmtraining.1 \
-  merge_unicharsets.1 \
-  set_unicharset_properties.1 \
-  tesseract.1 \
-  text2image.1 \
-  unicharambigs.5 \
-  unicharset_extractor.1 \
-  wordlist2dawg.1
-
-if !DISABLED_LEGACY_ENGINE
-man_MANS += \
-  ambiguous_words.1 \
-  classifier_tester.1 \
-  cntraining.1 \
-  mftraining.1 \
-  shapeclustering.1 \
-  unicharset.5
-endif
-
-man_xslt = http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl
-
-EXTRA_DIST = $(man_MANS) Doxyfile
-
-.PHONY: html
-
-html: ${man_MANS:%=%.html}
-pdf: ${man_MANS:%=%.pdf}
-
-SUFFIXES = .asc .html .pdf
-
-.asc:
-	-asciidoc -b docbook -d manpage -o - $< | \
-	xsltproc --nonet $(man_xslt) -
-
-.asc.html:
-	asciidoc -b html5 -o $@ $<
-
-.asc.pdf:
-	asciidoc -b docbook -d manpage -o $*.dbk $<
-	docbook2pdf $*.dbk
-
-MAINTAINERCLEANFILES = $(man_MANS) Doxyfile
-
-endif
--- a/doc/combine_tessdata.1.asc
+++ b/doc/combine_tessdata.1.asc
@ -66,6 +66,9 @@ OPTIONS
 *-e* '.traineddata' 'FILE'...:
    Extracts the specified components from the .traineddata file

+*-l* '.traineddata' 'FILE'...:
+   List the network information.
+
 *-o* '.traineddata' 'FILE'...:
    Overwrites the specified components of the .traineddata file
    with those provided on the command line.
--- a/doc/lstmtraining.1.asc
+++ b/doc/lstmtraining.1.asc
@ -41,7 +41,7 @@ OPTIONS
  Index in continue_from Network at which to attach the new network defined by net_spec  (type:int default:-1)

 '--max_iterations  '::
-  If set, exit after this many iterations  (type:int default:0)
+  If set, exit after this many iterations. A negative value is interpreted as epochs, 0 means infinite iterations.  (type:int default:0)

 '--target_error_rate  '::
  Final error rate in percent.  (type:double default:0.01)
--- a/doc/tesseract.1.asc
+++ b/doc/tesseract.1.asc
@ -191,9 +191,11 @@ following languages:
 *chi_sim* (Chinese simplified),
 *chi_tra* (Chinese traditional),
 *chr* (Cherokee),
+*cos* (Corsican),
 *cym* (Welsh),
 *dan* (Danish),
 *deu* (German),
+*div* (Dhivehi),
 *dzo* (Dzongkha),
 *ell* (Greek, Modern, 1453-),
 *eng* (English),
@ -203,10 +205,14 @@ following languages:
 *est* (Estonian),
 *eus* (Basque),
 *fas* (Persian),
+*fao* (Faroese),
+*fil* (Filipino),
 *fin* (Finnish),
 *fra* (French),
 *frk* (Frankish),
 *frm* (French, Middle, ca.1400-1600),
+*fry* (West Frisian),
+*gla* (Scottish Gaelic),
 *gle* (Irish),
 *glg* (Galician),
 *grc* (Greek, Ancient, to 1453),
@ -216,6 +222,7 @@ following languages:
 *hin* (Hindi),
 *hrv* (Croatian),
 *hun* (Hungarian),
+*hye* (Armenian),
 *iku* (Inuktitut),
 *ind* (Indonesian),
 *isl* (Icelandic),
@ -232,7 +239,6 @@ following languages:
 *kmr* (Kurdish Kurmanji),
 *kor* (Korean),
 *kor_vert* (Korean vertical),
-*kur* (Kurdish),
 *lao* (Lao),
 *lat* (Latin),
 *lav* (Latvian),
@ -277,7 +283,6 @@ following languages:
 *tat* (Tatar),
 *tel* (Telugu),
 *tgk* (Tajik),
-*tgl* (Tagalog),
 *tha* (Thai),
 *tir* (Tigrinya),
 *ton* (Tonga),
--- a/doc/tesseract.natvis
+++ b/doc/tesseract.natvis
@ -1,9 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
 <AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="STRING">
-    <DisplayString>{(char*)data_+sizeof(int)*2,s8}</DisplayString>
-  </Type>
-
  <Type Name="GenericVector&lt;*&gt;">
    <DisplayString>{{size={size_used_}}}</DisplayString>
    <Expand>
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit a18ac392d883ca88d1849b90071cea5608fd9293
+Subproject commit 703bd9caab50b139428cea1aaff9974ebee5742e
--- a/include/tesseract/baseapi.h
+++ b/include/tesseract/baseapi.h
@ -19,52 +19,36 @@
 #ifndef TESSERACT_API_BASEAPI_H_
 #define TESSERACT_API_BASEAPI_H_

-#include <cstdio>
-#include <functional>  // for std::function
-#include <tuple>
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif

-// To avoid collision with other typenames include the ABSOLUTE MINIMUM
-// complexity of includes here. Use forward declarations wherever possible
-// and hide includes of complex types in baseapi.cpp.
-#include <tesseract/version.h>
-
-#include "apitypes.h"
+#include "export.h"
 #include "pageiterator.h"
-#include "platform.h"
 #include "publictypes.h"
 #include "resultiterator.h"
-#include "serialis.h"
 #include "thresholder.h"
 #include "unichar.h"

-template <typename T>
-class GenericVector;
-class PAGE_RES;
-class PAGE_RES_IT;
-class ParagraphModel;
-struct BlamerBundle;
-class BLOCK_LIST;
-class DENORM;
-class MATRIX;
-class ROW;
-class STRING;
-class WERD;
+#include <tesseract/version.h>
+
+#include <cstdio>
+#include <vector> // for std::vector
+#include <tuple>  // for std::tuple
+
 struct Pix;
-struct Box;
 struct Pixa;
 struct Boxa;
-class ETEXT_DESC;
-struct OSResults;
-class TBOX;
-class UNICHARSET;
-class WERD_CHOICE_LIST;
-
-struct INT_FEATURE_STRUCT;
-using INT_FEATURE = INT_FEATURE_STRUCT*;
-struct TBLOB;

 namespace tesseract {

+class PAGE_RES;
+class ParagraphModel;
+class BLOCK_LIST;
+class ETEXT_DESC;
+struct OSResults;
+class UNICHARSET;
+
 class Dawg;
 class Dict;
 class EquationDetect;
@ -74,19 +58,14 @@ class ResultIterator;
 class MutableIterator;
 class TessResultRenderer;
 class Tesseract;
-class Trie;
-class Wordrec;

-using DictFunc = int (Dict::*)(void*, const UNICHARSET&, UNICHAR_ID,
-                               bool) const;
-using ProbabilityInContextFunc = double (Dict::*)(const char*, const char*, int,
-                                                  const char*, int);
-using ParamsModelClassifyFunc = float (Dict::*)(const char*, void*);
-using FillLatticeFunc = void (Wordrec::*)(const MATRIX&,
-                                          const WERD_CHOICE_LIST&,
-                                          const UNICHARSET&, BlamerBundle*);
-using TruthCallback =
-    std::function<void(const UNICHARSET&, int, PageIterator*, Pix*)>;
+// Function to read a std::vector<char> from a whole file.
+// Returns false on failure.
+using FileReader = bool (*)(const char *filename, std::vector<char> *data);
+
+using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const;
+using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, int, const char *,
+                                                  int);

 /**
 * Base class for all tesseract APIs.
@ -97,17 +76,17 @@ using TruthCallback =
 * include any other Tesseract headers.
 */
 class TESS_API TessBaseAPI {
- public:
+public:
  TessBaseAPI();
  virtual ~TessBaseAPI();
  // Copy constructor and assignment operator are currently unsupported.
-  TessBaseAPI(TessBaseAPI const&) = delete;
-  TessBaseAPI& operator=(TessBaseAPI const&) = delete;
+  TessBaseAPI(TessBaseAPI const &) = delete;
+  TessBaseAPI &operator=(TessBaseAPI const &) = delete;

  /**
   * Returns the version identifier as a static string. Do not delete.
   */
-  static const char* Version();
+  static const char *Version();

  /**
   * If compiled with OpenCL AND an available OpenCL
@ -116,13 +95,13 @@ class TESS_API TessBaseAPI {
   * and returns sizeof(cl_device_id)
   * otherwise *device=nullptr and returns 0.
   */
-  static size_t getOpenCLDevice(void** device);
+  static size_t getOpenCLDevice(void **device);

  /**
   * Set the name of the input file. Needed for training and
   * reading a UNLV zone file, and for searchable PDF output.
   */
-  void SetInputName(const char* name);
+  void SetInputName(const char *name);
  /**
   * These functions are required for searchable PDF output.
   * We need our hands on the input file so that we can include
@ -130,15 +109,15 @@ class TESS_API TessBaseAPI {
   * we need the original image. Finally, resolution metadata
   * is stored in the PDF so we need that as well.
   */
-  const char* GetInputName();
+  const char *GetInputName();
  // Takes ownership of the input pix.
-  void SetInputImage(Pix* pix);
-  Pix* GetInputImage();
+  void SetInputImage(Pix *pix);
+  Pix *GetInputImage();
  int GetSourceYResolution();
-  const char* GetDatapath();
+  const char *GetDatapath();

  /** Set the name of the bonus output files. Needed only for debugging. */
-  void SetOutputName(const char* name);
+  void SetOutputName(const char *name);

  /**
   * Set the value of an internal "parameter."
@ -153,32 +132,32 @@ class TESS_API TessBaseAPI {
   * Note: Must be called after Init(). Only works for non-init variables
   * (init variables should be passed to Init()).
   */
-  bool SetVariable(const char* name, const char* value);
-  bool SetDebugVariable(const char* name, const char* value);
+  bool SetVariable(const char *name, const char *value);
+  bool SetDebugVariable(const char *name, const char *value);

  /**
   * Returns true if the parameter was found among Tesseract parameters.
   * Fills in value with the value of the parameter.
   */
-  bool GetIntVariable(const char* name, int* value) const;
-  bool GetBoolVariable(const char* name, bool* value) const;
-  bool GetDoubleVariable(const char* name, double* value) const;
+  bool GetIntVariable(const char *name, int *value) const;
+  bool GetBoolVariable(const char *name, bool *value) const;
+  bool GetDoubleVariable(const char *name, double *value) const;

  /**
   * Returns the pointer to the string that represents the value of the
   * parameter if it was found among Tesseract parameters.
   */
-  const char* GetStringVariable(const char* name) const;
+  const char *GetStringVariable(const char *name) const;

  /**
   * Print Tesseract parameters to the given file.
   */
-  void PrintVariables(FILE* fp) const;
+  void PrintVariables(FILE *fp) const;

  /**
   * Get value of named variable as a string, if it exists.
   */
-  bool GetVariableAsString(const char* name, STRING* val);
+  bool GetVariableAsString(const char *name, std::string *val);

  /**
   * Instances are now mostly thread-safe and totally independent,
@ -217,25 +196,21 @@ class TESS_API TessBaseAPI {
   * If set_only_non_debug_params is true, only params that do not contain
   * "debug" in the name will be set.
   */
-  int Init(const char* datapath, const char* language, OcrEngineMode mode,
-           char** configs, int configs_size,
-           const GenericVector<STRING>* vars_vec,
-           const GenericVector<STRING>* vars_values,
-           bool set_only_non_debug_params);
-  int Init(const char* datapath, const char* language, OcrEngineMode oem) {
+  int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs,
+           int configs_size, const std::vector<std::string> *vars_vec,
+           const std::vector<std::string> *vars_values, bool set_only_non_debug_params);
+  int Init(const char *datapath, const char *language, OcrEngineMode oem) {
    return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
  }
-  int Init(const char* datapath, const char* language) {
-    return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
-                false);
+  int Init(const char *datapath, const char *language) {
+    return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, false);
  }
  // In-memory version reads the traineddata file directly from the given
  // data[data_size] array, and/or reads data via a FileReader.
-  int Init(const char* data, int data_size, const char* language,
-           OcrEngineMode mode, char** configs, int configs_size,
-           const GenericVector<STRING>* vars_vec,
-           const GenericVector<STRING>* vars_values,
-           bool set_only_non_debug_params, FileReader reader);
+  int Init(const char *data, int data_size, const char *language, OcrEngineMode mode,
+           char **configs, int configs_size, const std::vector<std::string> *vars_vec,
+           const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
+           FileReader reader);

  /**
   * Returns the languages string used in the last valid initialization.
@ -245,19 +220,19 @@ class TESS_API TessBaseAPI {
   * loaded use GetLoadedLanguagesAsVector.
   * The returned string should NOT be deleted.
   */
-  const char* GetInitLanguagesAsString() const;
+  const char *GetInitLanguagesAsString() const;

  /**
-   * Returns the loaded languages in the vector of STRINGs.
+   * Returns the loaded languages in the vector of std::string.
   * Includes all languages loaded by the last Init, including those loaded
   * as dependencies of other loaded languages.
   */
-  void GetLoadedLanguagesAsVector(GenericVector<STRING>* langs) const;
+  void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;

  /**
-   * Returns the available languages in the sorted vector of STRINGs.
+   * Returns the available languages in the sorted vector of std::string.
   */
-  void GetAvailableLanguagesAsVector(GenericVector<STRING>* langs) const;
+  void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;

  /**
   * Init only the lang model component of Tesseract. The only functions
@ -265,7 +240,7 @@ class TESS_API TessBaseAPI {
   * WARNING: temporary! This function will be removed from here and placed
   * in a separate API at some future time.
   */
-  int InitLangMod(const char* datapath, const char* language);
+  int InitLangMod(const char *datapath, const char *language);

  /**
   * Init only for page layout analysis. Use only for calls to SetImage and
@ -279,9 +254,9 @@ class TESS_API TessBaseAPI {
   * and also accepts a relative or absolute path name.
   * Note: only non-init params will be set (init params are set by Init()).
   */
-  void ReadConfigFile(const char* filename);
+  void ReadConfigFile(const char *filename);
  /** Same as above, but only set debug params from the given config file. */
-  void ReadDebugConfigFile(const char* filename);
+  void ReadDebugConfigFile(const char *filename);

  /**
   * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
@ -310,9 +285,8 @@ class TESS_API TessBaseAPI {
   * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
   * and one or more of the Get*Text functions below.
   */
-  char* TesseractRect(const unsigned char* imagedata, int bytes_per_pixel,
-                      int bytes_per_line, int left, int top, int width,
-                      int height);
+  char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line,
+                      int left, int top, int width, int height);

  /**
   * Call between pages or documents etc to free up memory and forget
@ -335,8 +309,8 @@ class TESS_API TessBaseAPI {
   * full image, so it may be followed immediately by a GetUTF8Text, and it
   * will automatically perform recognition.
   */
-  void SetImage(const unsigned char* imagedata, int width, int height,
-                int bytes_per_pixel, int bytes_per_line);
+  void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,
+                int bytes_per_line);

  /**
   * Provide an image for Tesseract to recognize. As with SetImage above,
@ -346,7 +320,7 @@ class TESS_API TessBaseAPI {
   * Use Pix where possible. Tesseract uses Pix as its internal representation
   * and it is therefore more efficient to provide a Pix directly.
   */
-  void SetImage(Pix* pix);
+  void SetImage(Pix *pix);

  /**
   * Set the resolution of the source image in pixels per inch so font size
@ -368,7 +342,7 @@ class TESS_API TessBaseAPI {
   * Note that Tesseract takes ownership of the Thresholder and will
   * delete it when it is replaced or the API is destructed.
   */
-  void SetThresholder(ImageThresholder* thresholder) {
+  void SetThresholder(ImageThresholder *thresholder) {
    delete thresholder_;
    thresholder_ = thresholder;
    ClearResults();
@ -379,14 +353,14 @@ class TESS_API TessBaseAPI {
   * Caller takes ownership of the Pix and must pixDestroy it.
   * May be called any time after SetImage, or after TesseractRect.
   */
-  Pix* GetThresholdedImage();
+  Pix *GetThresholdedImage();

  /**
   * Get the result of page layout analysis as a leptonica-style
   * Boxa, Pixa pair, in reading order.
   * Can be called before or after Recognize.
   */
-  Boxa* GetRegions(Pixa** pixa);
+  Boxa *GetRegions(Pixa **pixa);

  /**
   * Get the textlines as a leptonica-style
@ -399,12 +373,11 @@ class TESS_API TessBaseAPI {
   * nullptr, the paragraph-id of each line within its block is also returned as
   * an array of one element per line. delete [] after use.
   */
-  Boxa* GetTextlines(bool raw_image, int raw_padding, Pixa** pixa,
-                     int** blockids, int** paraids);
+  Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids);
  /*
-     Helper method to extract from the thresholded image. (most common usage)
-  */
-  Boxa* GetTextlines(Pixa** pixa, int** blockids) {
+   Helper method to extract from the thresholded image. (most common usage)
+*/
+  Boxa *GetTextlines(Pixa **pixa, int **blockids) {
    return GetTextlines(false, 0, pixa, blockids, nullptr);
  }

@ -416,14 +389,14 @@ class TESS_API TessBaseAPI {
   * If blockids is not nullptr, the block-id of each line is also returned as
   * an array of one element per line. delete [] after use.
   */
-  Boxa* GetStrips(Pixa** pixa, int** blockids);
+  Boxa *GetStrips(Pixa **pixa, int **blockids);

  /**
   * Get the words as a leptonica-style
   * Boxa, Pixa pair, in reading order.
   * Can be called before or after Recognize.
   */
-  Boxa* GetWords(Pixa** pixa);
+  Boxa *GetWords(Pixa **pixa);

  /**
   * Gets the individual connected (text) components (created
@ -433,7 +406,7 @@ class TESS_API TessBaseAPI {
   * Note: the caller is responsible for calling boxaDestroy()
   * on the returned Boxa array and pixaDestroy() on cc array.
   */
-  Boxa* GetConnectedComponents(Pixa** cc);
+  Boxa *GetConnectedComponents(Pixa **cc);

  /**
   * Get the given level kind of components (block, textline, word etc.) as a
@ -447,14 +420,12 @@ class TESS_API TessBaseAPI {
   * extracted instead of the thresholded image and padded with raw_padding. If
   * text_only is true, then only text components are returned.
   */
-  Boxa* GetComponentImages(PageIteratorLevel level, bool text_only,
-                           bool raw_image, int raw_padding, Pixa** pixa,
-                           int** blockids, int** paraids);
+  Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding,
+                           Pixa **pixa, int **blockids, int **paraids);
  // Helper function to get binary images with no padding (most common usage).
-  Boxa* GetComponentImages(const PageIteratorLevel level, const bool text_only,
-                           Pixa** pixa, int** blockids) {
-    return GetComponentImages(level, text_only, false, 0, pixa, blockids,
-                              nullptr);
+  Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, Pixa **pixa,
+                           int **blockids) {
+    return GetComponentImages(level, text_only, false, 0, pixa, blockids, nullptr);
  }

  /**
@ -480,8 +451,8 @@ class TESS_API TessBaseAPI {
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
   * DetectOS, or anything else that changes the internal PAGE_RES.
   */
-  PageIterator* AnalyseLayout();
-  PageIterator* AnalyseLayout(bool merge_similar_words);
+  PageIterator *AnalyseLayout();
+  PageIterator *AnalyseLayout(bool merge_similar_words);

  /**
   * Recognize the image from SetAndThresholdImage, generating Tesseract
@ -489,18 +460,13 @@ class TESS_API TessBaseAPI {
   * Optional. The Get*Text functions below will call Recognize if needed.
   * After Recognize, the output is kept internally until the next SetImage.
   */
-  int Recognize(ETEXT_DESC* monitor);
+  int Recognize(ETEXT_DESC *monitor);

  /**
   * Methods to retrieve information after SetAndThresholdImage(),
   * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
   */

-#ifndef DISABLED_LEGACY_ENGINE
-  /** Variant on Recognize used for testing chopper. */
-  int RecognizeForChopTest(ETEXT_DESC* monitor);
-#endif
-
  /**
   * Turns images into symbolic text.
   *
@ -523,11 +489,11 @@ class TESS_API TessBaseAPI {
   *
   * Returns true if successful, false on error.
   */
-  bool ProcessPages(const char* filename, const char* retry_config,
-                    int timeout_millisec, TessResultRenderer* renderer);
+  bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
+                    TessResultRenderer *renderer);
  // Does the real work of ProcessPages.
-  bool ProcessPagesInternal(const char* filename, const char* retry_config,
-                            int timeout_millisec, TessResultRenderer* renderer);
+  bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec,
+                            TessResultRenderer *renderer);

  /**
   * Turn a single image into symbolic text.
@ -538,9 +504,8 @@ class TESS_API TessBaseAPI {
   *
   * See ProcessPages for descriptions of other parameters.
   */
-  bool ProcessPage(Pix* pix, int page_index, const char* filename,
-                   const char* retry_config, int timeout_millisec,
-                   TessResultRenderer* renderer);
+  bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config,
+                   int timeout_millisec, TessResultRenderer *renderer);

  /**
   * Get a reading-order iterator to the results of LayoutAnalysis and/or
@ -550,7 +515,7 @@ class TESS_API TessBaseAPI {
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
   * DetectOS, or anything else that changes the internal PAGE_RES.
   */
-  ResultIterator* GetIterator();
+  ResultIterator *GetIterator();

  /**
   * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
@ -560,13 +525,13 @@ class TESS_API TessBaseAPI {
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
   * DetectOS, or anything else that changes the internal PAGE_RES.
   */
-  MutableIterator* GetMutableIterator();
+  MutableIterator *GetMutableIterator();

  /**
   * The recognized text is returned as a char* which is coded
   * as UTF8 and must be freed with the delete [] operator.
   */
-  char* GetUTF8Text();
+  char *GetUTF8Text();
  
  size_t GetNumberOfTables();
  
@ -584,6 +549,7 @@ class TESS_API TessBaseAPI {
  std::vector<std::tuple<int,int,int,int> > GetTableCols(
    unsigned i///<Table index needs to be less than GetNumberOfTables()
  );
+
  /**
   * Make a HTML-formatted string with hOCR markup from the internal
   * data structures.
@ -593,7 +559,7 @@ class TESS_API TessBaseAPI {
   *  receive progress callbacks
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
+  char *GetHOCRText(ETEXT_DESC *monitor, int page_number);

  /**
   * Make a HTML-formatted string with hOCR markup from the internal
@ -601,26 +567,26 @@ class TESS_API TessBaseAPI {
   * page_number is 0-based but will appear in the output as 1-based.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetHOCRText(int page_number);
+  char *GetHOCRText(int page_number);

  /**
   * Make an XML-formatted string with Alto markup from the internal
   * data structures.
   */
-  char* GetAltoText(ETEXT_DESC* monitor, int page_number);
+  char *GetAltoText(ETEXT_DESC *monitor, int page_number);

  /**
   * Make an XML-formatted string with Alto markup from the internal
   * data structures.
   */
-  char* GetAltoText(int page_number);
+  char *GetAltoText(int page_number);

  /**
   * Make a TSV-formatted string from the internal data structures.
   * page_number is 0-based but will appear in the output as 1-based.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetTSVText(int page_number);
+  char *GetTSVText(int page_number);

  /**
   * Make a box file for LSTM training from the internal data structures.
@ -628,7 +594,7 @@ class TESS_API TessBaseAPI {
   * page_number is a 0-based page index that will appear in the box file.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetLSTMBoxText(int page_number);
+  char *GetLSTMBoxText(int page_number);

  /**
   * The recognized text is returned as a char* which is coded in the same
@ -637,7 +603,7 @@ class TESS_API TessBaseAPI {
   * page_number is a 0-based page index that will appear in the box file.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetBoxText(int page_number);
+  char *GetBoxText(int page_number);

  /**
   * The recognized text is returned as a char* which is coded in the same
@ -645,14 +611,14 @@ class TESS_API TessBaseAPI {
   * page_number is a 0-based page index that will appear in the box file.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetWordStrBoxText(int page_number);
+  char *GetWordStrBoxText(int page_number);

  /**
   * The recognized text is returned as a char* which is coded
   * as UNLV format Latin-1 with specific reject and suspect codes.
   * Returned string must be freed with the delete [] operator.
   */
-  char* GetUNLVText();
+  char *GetUNLVText();

  /**
   * Detect the orientation of the input image and apparent script (alphabet).
@ -663,15 +629,15 @@ class TESS_API TessBaseAPI {
   * script_conf is confidence level in the script
   * Returns true on success and writes values to each parameter as an output
   */
-  bool DetectOrientationScript(int* orient_deg, float* orient_conf,
-                               const char** script_name, float* script_conf);
+  bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name,
+                               float *script_conf);

  /**
   * The recognized text is returned as a char* which is coded
   * as UTF8 and must be freed with the delete [] operator.
   * page_number is a 0-based page index that will appear in the osd file.
   */
-  char* GetOsdText(int page_number);
+  char *GetOsdText(int page_number);

  /** Returns the (average) confidence value between 0 and 100. */
  int MeanTextConf();
@ -681,7 +647,7 @@ class TESS_API TessBaseAPI {
   * The number of confidences should correspond to the number of space-
   * delimited words in GetUTF8Text.
   */
-  int* AllWordConfidences();
+  int *AllWordConfidences();

 #ifndef DISABLED_LEGACY_ENGINE
  /**
@ -694,8 +660,8 @@ class TESS_API TessBaseAPI {
   * The currently set PageSegMode is preserved.
   * Returns false if adaption was not possible for some reason.
   */
-  bool AdaptToWordStr(PageSegMode mode, const char* wordstr);
-#endif  //  ndef DISABLED_LEGACY_ENGINE
+  bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
+#endif //  ndef DISABLED_LEGACY_ENGINE

  /**
   * Free up recognition results and any stored image data, without actually
@ -728,11 +694,11 @@ class TESS_API TessBaseAPI {
   * @warning temporary! This function will be removed from here and placed
   * in a separate API at some future time.
   */
-  int IsValidWord(const char* word);
+  int IsValidWord(const char *word);
  // Returns true if utf8_character is defined in the UniCharset.
-  bool IsValidCharacter(const char* utf8_character);
+  bool IsValidCharacter(const char *utf8_character);

-  bool GetTextDirection(int* out_offset, float* out_slope);
+  bool GetTextDirection(int *out_offset, float *out_slope);

  /** Sets Dict::letter_is_okay_ function to point to the given function. */
  void SetDictFunc(DictFunc f);
@ -746,73 +712,24 @@ class TESS_API TessBaseAPI {
   * Estimates the Orientation And Script of the image.
   * @return true if the image was processed successfully.
   */
-  bool DetectOS(OSResults*);
+  bool DetectOS(OSResults *);

  /**
   * Return text orientation of each block as determined by an earlier run
   * of layout analysis.
   */
-  void GetBlockTextOrientations(int** block_orientation,
-                                bool** vertical_writing);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-  /** Sets Wordrec::fill_lattice_ function to point to the given function. */
-  void SetFillLatticeFunc(FillLatticeFunc f);
-
-  /** Find lines from the image making the BLOCK_LIST. */
-  BLOCK_LIST* FindLinesCreateBlockList();
-
-  /**
-   * Delete a block list.
-   * This is to keep BLOCK_LIST pointer opaque
-   * and let go of including the other headers.
-   */
-  static void DeleteBlockList(BLOCK_LIST* block_list);
-
-  /** Returns a ROW object created from the input row specification. */
-  static ROW* MakeTessOCRRow(float baseline, float xheight, float descender,
-                             float ascender);
-
-  /** Returns a TBLOB corresponding to the entire input image. */
-  static TBLOB* MakeTBLOB(Pix* pix);
-
-  /**
-   * This method baseline normalizes a TBLOB in-place. The input row is used
-   * for normalization. The denorm is an optional parameter in which the
-   * normalization-antidote is returned.
-   */
-  static void NormalizeTBLOB(TBLOB* tblob, ROW* row, bool numeric_mode);
-
-  /** This method returns the features associated with the input image. */
-  void GetFeaturesForBlob(TBLOB* blob, INT_FEATURE_STRUCT* int_features,
-                          int* num_features, int* feature_outline_index);
-
-  /**
-   * This method returns the row to which a box of specified dimensions would
-   * belong. If no good match is found, it returns nullptr.
-   */
-  static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top, int right,
-                            int bottom);
-
-  /**
-   * Method to run adaptive classifier on a blob.
-   * It returns at max num_max_matches results.
-   */
-  void RunAdaptiveClassifier(TBLOB* blob, int num_max_matches, int* unichar_ids,
-                             float* ratings, int* num_matches_returned);
-#endif  // ndef DISABLED_LEGACY_ENGINE
+  void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing);

  /** This method returns the string form of the specified unichar. */
-  const char* GetUnichar(int unichar_id);
+  const char *GetUnichar(int unichar_id);

  /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
-  const Dawg* GetDawg(int i) const;
+  const Dawg *GetDawg(int i) const;

  /** Return the number of dawgs loaded into tesseract_ object. */
  int NumDawgs() const;

-  Tesseract* tesseract() const {
+  Tesseract *tesseract() const {
    return tesseract_;
  }

@ -820,29 +737,25 @@ class TESS_API TessBaseAPI {
    return last_oem_requested_;
  }

-  void InitTruthCallback(TruthCallback cb) {
-    truth_cb_ = cb;
-  }
-
  void set_min_orientation_margin(double margin);
  /* @} */

- protected:
+protected:
  /** Common code for setting the image. Returns true if Init has been called.
   */
-  TESS_LOCAL bool InternalSetImage();
+  bool InternalSetImage();

  /**
   * Run the thresholder to make the thresholded image. If pix is not nullptr,
   * the source is thresholded to pix instead of the internal IMAGE.
   */
-  TESS_LOCAL virtual bool Threshold(Pix** pix);
+  virtual bool Threshold(Pix **pix);

  /**
   * Find lines from the image making the BLOCK_LIST.
   * @return 0 on success.
   */
-  TESS_LOCAL int FindLines();
+  int FindLines();

  /** Delete the pageres and block list ready for a new page. */
  void ClearResults();
@ -852,7 +765,7 @@ class TESS_API TessBaseAPI {
   * to ignore all BiDi smarts at that point.
   * delete once you're done with it.
   */
-  TESS_LOCAL LTRResultIterator* GetLTRIterator();
+  LTRResultIterator *GetLTRIterator();

  /**
   * Return the length of the output text string, as UTF8, assuming
@ -860,61 +773,30 @@ class TESS_API TessBaseAPI {
   * and assuming a single character reject marker for each rejected character.
   * Also return the number of recognized blobs in blob_count.
   */
-  TESS_LOCAL int TextLength(int* blob_count);
+  int TextLength(int *blob_count);

  //// paragraphs.cpp ////////////////////////////////////////////////////
-  TESS_LOCAL void DetectParagraphs(bool after_text_recognition);
+  void DetectParagraphs(bool after_text_recognition);

-#ifndef DISABLED_LEGACY_ENGINE
-
-  /** @defgroup ocropusAddOns ocropus add-ons */
-  /* @{ */
-
-  /**
-   * Adapt to recognize the current image as the given character.
-   * The image must be preloaded and be just an image of a single character.
-   */
-  TESS_LOCAL void AdaptToCharacter(const char* unichar_repr, int length,
-                                   float baseline, float xheight,
-                                   float descender, float ascender);
-
-  /** Recognize text doing one pass only, using settings for a given pass. */
-  TESS_LOCAL PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
-
-  TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
-                                        PAGE_RES* pass1_result);
-
-  /**
-   * Extract the OCR results, costs (penalty points for uncertainty),
-   * and the bounding boxes of the characters.
-   */
-  TESS_LOCAL static int TesseractExtractResult(char** text, int** lengths,
-                                               float** costs, int** x0,
-                                               int** y0, int** x1, int** y1,
-                                               PAGE_RES* page_res);
-
-  TESS_LOCAL const PAGE_RES* GetPageRes() const {
+  const PAGE_RES *GetPageRes() const {
    return page_res_;
  }
-  /* @} */
-#endif  // ndef DISABLED_LEGACY_ENGINE

- protected:
-  Tesseract* tesseract_;           ///< The underlying data object.
-  Tesseract* osd_tesseract_;       ///< For orientation & script detection.
-  EquationDetect* equ_detect_;     ///< The equation detector.
-  FileReader reader_;              ///< Reads files from any filesystem.
-  ImageThresholder* thresholder_;  ///< Image thresholding module.
-  GenericVector<ParagraphModel*>* paragraph_models_;
-  BLOCK_LIST* block_list_;            ///< The page layout.
-  PAGE_RES* page_res_;                ///< The page-level data.
-  STRING* input_file_;                ///< Name used by training code.
-  STRING* output_file_;               ///< Name used by debug code.
-  STRING* datapath_;                  ///< Current location of tessdata.
-  STRING* language_;                  ///< Last initialized language.
-  OcrEngineMode last_oem_requested_;  ///< Last ocr language mode requested.
-  bool recognition_done_;             ///< page_res_ contains recognition data.
-  TruthCallback truth_cb_;            ///< fxn for setting truth_* in WERD_RES
+protected:
+  Tesseract *tesseract_;          ///< The underlying data object.
+  Tesseract *osd_tesseract_;      ///< For orientation & script detection.
+  EquationDetect *equ_detect_;    ///< The equation detector.
+  FileReader reader_;             ///< Reads files from any filesystem.
+  ImageThresholder *thresholder_; ///< Image thresholding module.
+  std::vector<ParagraphModel *> *paragraph_models_;
+  BLOCK_LIST *block_list_;           ///< The page layout.
+  PAGE_RES *page_res_;               ///< The page-level data.
+  std::string input_file_;           ///< Name used by training code.
+  std::string output_file_;          ///< Name used by debug code.
+  std::string datapath_;             ///< Current location of tessdata.
+  std::string language_;             ///< Last initialized language.
+  OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
+  bool recognition_done_;            ///< page_res_ contains recognition data.

  /**
   * @defgroup ThresholderParams Thresholder Parameters
@ -929,21 +811,20 @@ class TESS_API TessBaseAPI {
  int image_height_;
  /* @} */

- private:
+private:
  // A list of image filenames gets special consideration
-  bool ProcessPagesFileList(FILE* fp, STRING* buf, const char* retry_config,
-                            int timeout_millisec, TessResultRenderer* renderer,
+  bool ProcessPagesFileList(FILE *fp, std::string *buf, const char *retry_config,
+                            int timeout_millisec, TessResultRenderer *renderer,
                            int tessedit_page_number);
  // TIFF supports multipage so gets special consideration.
-  bool ProcessPagesMultipageTiff(const unsigned char* data, size_t size,
-                                 const char* filename, const char* retry_config,
-                                 int timeout_millisec,
-                                 TessResultRenderer* renderer,
-                                 int tessedit_page_number);
-};  // class TessBaseAPI.
+  bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, const char *filename,
+                                 const char *retry_config, int timeout_millisec,
+                                 TessResultRenderer *renderer, int tessedit_page_number);
+}; // class TessBaseAPI.

 /** Escape a char string - replace &<>"' with HTML codes. */
-STRING HOcrEscape(const char* text);
-}  // namespace tesseract.
+std::string HOcrEscape(const char *text);

-#endif  // TESSERACT_API_BASEAPI_H_
+} // namespace tesseract
+
+#endif // TESSERACT_API_BASEAPI_H_
--- a/include/tesseract/capi.h
+++ b/include/tesseract/capi.h
@ -18,22 +18,18 @@
 #ifndef API_CAPI_H_
 #define API_CAPI_H_

-#if defined(TESSERACT_API_BASEAPI_H_) && !defined(TESS_CAPI_INCLUDE_BASEAPI)
-#  define TESS_CAPI_INCLUDE_BASEAPI
+#include "export.h"
+
+#ifdef __cplusplus
+#  include <tesseract/baseapi.h>
+#  include <tesseract/ocrclass.h>
+#  include <tesseract/pageiterator.h>
+#  include <tesseract/renderer.h>
+#  include <tesseract/resultiterator.h>
 #endif

-#ifdef TESS_CAPI_INCLUDE_BASEAPI
-#  include "baseapi.h"
-#  include "ocrclass.h"
-#  include "pageiterator.h"
-#  include "renderer.h"
-#  include "resultiterator.h"
-#else
-#  include <stdbool.h>
-#  include <stdio.h>
-
-#  include "platform.h"
-#endif
+#include <stdbool.h>
+#include <stdio.h>

 #ifdef __cplusplus
 extern "C" {
@ -45,7 +41,7 @@ extern "C" {
 #  define FALSE 0
 #endif

-#ifdef TESS_CAPI_INCLUDE_BASEAPI
+#ifdef __cplusplus
 typedef tesseract::TessResultRenderer TessResultRenderer;
 typedef tesseract::TessBaseAPI TessBaseAPI;
 typedef tesseract::PageIterator TessPageIterator;
@ -54,19 +50,13 @@ typedef tesseract::MutableIterator TessMutableIterator;
 typedef tesseract::ChoiceIterator TessChoiceIterator;
 typedef tesseract::OcrEngineMode TessOcrEngineMode;
 typedef tesseract::PageSegMode TessPageSegMode;
-typedef tesseract::ImageThresholder TessImageThresholder;
 typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
-typedef tesseract::DictFunc TessDictFunc;
-typedef tesseract::ProbabilityInContextFunc TessProbabilityInContextFunc;
-// typedef tesseract::ParamsModelClassifyFunc TessParamsModelClassifyFunc;
-typedef tesseract::FillLatticeFunc TessFillLatticeFunc;
-typedef tesseract::Dawg TessDawg;
-typedef tesseract::TruthCallback TessTruthCallback;
 typedef tesseract::Orientation TessOrientation;
 typedef tesseract::ParagraphJustification TessParagraphJustification;
 typedef tesseract::WritingDirection TessWritingDirection;
 typedef tesseract::TextlineOrder TessTextlineOrder;
-typedef PolyBlockType TessPolyBlockType;
+typedef tesseract::PolyBlockType TessPolyBlockType;
+typedef tesseract::ETEXT_DESC ETEXT_DESC;
 #else
 typedef struct TessResultRenderer TessResultRenderer;
 typedef struct TessBaseAPI TessBaseAPI;
@ -147,9 +137,8 @@ typedef enum TessTextlineOrder {
 typedef struct ETEXT_DESC ETEXT_DESC;
 #endif

-typedef bool (*TessCancelFunc)(void* cancel_this, int words);
-typedef bool (*TessProgressFunc)(ETEXT_DESC* ths, int left, int right, int top,
-                                 int bottom);
+typedef bool (*TessCancelFunc)(void *cancel_this, int words);
+typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top, int bottom);

 struct Pix;
 struct Boxa;
@ -157,409 +146,274 @@ struct Pixa;

 /* General free functions */

-TESS_API const char* TessVersion();
-TESS_API void TessDeleteText(const char* text);
-TESS_API void TessDeleteTextArray(char** arr);
-TESS_API void TessDeleteIntArray(const int* arr);
+TESS_API const char *TessVersion();
+TESS_API void TessDeleteText(const char *text);
+TESS_API void TessDeleteTextArray(char **arr);
+TESS_API void TessDeleteIntArray(const int *arr);

 /* Renderer API */
-TESS_API TessResultRenderer* TessTextRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessHOcrRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessHOcrRendererCreate2(const char* outputbase,
-                                                     BOOL font_info);
-TESS_API TessResultRenderer* TessAltoRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessTsvRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessPDFRendererCreate(const char* outputbase,
-                                                   const char* datadir,
+TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info);
+TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,
                                                   BOOL textonly);
-TESS_API TessResultRenderer* TessUnlvRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessBoxTextRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessLSTMBoxRendererCreate(const char* outputbase);
-TESS_API TessResultRenderer* TessWordStrBoxRendererCreate(
-    const char* outputbase);
+TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase);

-TESS_API void TessDeleteResultRenderer(TessResultRenderer* renderer);
-TESS_API void TessResultRendererInsert(TessResultRenderer* renderer,
-                                       TessResultRenderer* next);
-TESS_API TessResultRenderer* TessResultRendererNext(
-    TessResultRenderer* renderer);
-TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer* renderer,
-                                              const char* title);
-TESS_API BOOL TessResultRendererAddImage(TessResultRenderer* renderer,
-                                         TessBaseAPI* api);
-TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer* renderer);
+TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
+TESS_API void TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next);
+TESS_API TessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer);
+TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title);
+TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api);
+TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);

-TESS_API const char* TessResultRendererExtention(TessResultRenderer* renderer);
-TESS_API const char* TessResultRendererTitle(TessResultRenderer* renderer);
-TESS_API int TessResultRendererImageNum(TessResultRenderer* renderer);
+TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
+TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
+TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);

 /* Base API */

-TESS_API TessBaseAPI* TessBaseAPICreate();
-TESS_API void TessBaseAPIDelete(TessBaseAPI* handle);
+TESS_API TessBaseAPI *TessBaseAPICreate();
+TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);

-TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI* handle, void** device);
+TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);

-TESS_API void TessBaseAPISetInputName(TessBaseAPI* handle, const char* name);
-TESS_API const char* TessBaseAPIGetInputName(TessBaseAPI* handle);
+TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
+TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);

-TESS_API void TessBaseAPISetInputImage(TessBaseAPI* handle, struct Pix* pix);
-TESS_API struct Pix* TessBaseAPIGetInputImage(TessBaseAPI* handle);
+TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
+TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);

-TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI* handle);
-TESS_API const char* TessBaseAPIGetDatapath(TessBaseAPI* handle);
+TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
+TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);

-TESS_API void TessBaseAPISetOutputName(TessBaseAPI* handle, const char* name);
+TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);

-TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI* handle, const char* name,
-                                     const char* value);
-TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI* handle, const char* name,
-                                          const char* value);
+TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value);
+TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value);

-TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI* handle,
-                                        const char* name, int* value);
-TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI* handle,
-                                         const char* name, BOOL* value);
-TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI* handle,
-                                           const char* name, double* value);
-TESS_API const char* TessBaseAPIGetStringVariable(const TessBaseAPI* handle,
-                                                  const char* name);
+TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value);
+TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value);
+TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name,
+                                           double *value);
+TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name);

-TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI* handle, FILE* fp);
-TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI* handle,
-                                              const char* filename);
+TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
+TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename);

-#ifdef TESS_CAPI_INCLUDE_BASEAPI
+TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,
+                              TessOcrEngineMode oem, char **configs, int configs_size);
+TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,
+                              TessOcrEngineMode oem);
+TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language);

-TESS_API BOOL TessBaseAPIGetVariableAsString(TessBaseAPI* handle,
-                                             const char* name, STRING* val);
-
-TESS_API int TessBaseAPIInit(TessBaseAPI* handle, const char* datapath,
-                             const char* language, TessOcrEngineMode mode,
-                             char** configs, int configs_size,
-                             const STRING* vars_vec, size_t vars_vec_size,
-                             const STRING* vars_values, size_t vars_values_size,
-                             BOOL set_only_init_params);
-
-#endif  // def TESS_CAPI_INCLUDE_BASEAPI
-
-TESS_API int TessBaseAPIInit1(TessBaseAPI* handle, const char* datapath,
-                              const char* language, TessOcrEngineMode oem,
-                              char** configs, int configs_size);
-TESS_API int TessBaseAPIInit2(TessBaseAPI* handle, const char* datapath,
-                              const char* language, TessOcrEngineMode oem);
-TESS_API int TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath,
-                              const char* language);
-
-TESS_API int TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath,
-                              const char* language, TessOcrEngineMode mode,
-                              char** configs, int configs_size, char** vars_vec,
-                              char** vars_values, size_t vars_vec_size,
+TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,
+                              TessOcrEngineMode mode, char **configs, int configs_size,
+                              char **vars_vec, char **vars_values, size_t vars_vec_size,
                              BOOL set_only_non_debug_params);

-TESS_API const char* TessBaseAPIGetInitLanguagesAsString(
-    const TessBaseAPI* handle);
-TESS_API char** TessBaseAPIGetLoadedLanguagesAsVector(
-    const TessBaseAPI* handle);
-TESS_API char** TessBaseAPIGetAvailableLanguagesAsVector(
-    const TessBaseAPI* handle);
+TESS_API const char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle);
+TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle);
+TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle);

-TESS_API int TessBaseAPIInitLangMod(TessBaseAPI* handle, const char* datapath,
-                                    const char* language);
-TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI* handle);
+TESS_API int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath,
+                                    const char *language);
+TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);

-TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI* handle,
-                                        const char* filename);
-TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI* handle,
-                                             const char* filename);
+TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename);
+TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename);

-TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI* handle,
-                                        TessPageSegMode mode);
-TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI* handle);
+TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode);
+TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);

-TESS_API char* TessBaseAPIRect(TessBaseAPI* handle,
-                               const unsigned char* imagedata,
-                               int bytes_per_pixel, int bytes_per_line,
-                               int left, int top, int width, int height);
+TESS_API char *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata,
+                               int bytes_per_pixel, int bytes_per_line, int left, int top,
+                               int width, int height);

-TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI* handle);
+TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);

-TESS_API void TessBaseAPISetImage(TessBaseAPI* handle,
-                                  const unsigned char* imagedata, int width,
-                                  int height, int bytes_per_pixel,
-                                  int bytes_per_line);
-TESS_API void TessBaseAPISetImage2(TessBaseAPI* handle, struct Pix* pix);
+TESS_API void TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width,
+                                  int height, int bytes_per_pixel, int bytes_per_line);
+TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);

-TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI* handle, int ppi);
+TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);

-TESS_API void TessBaseAPISetRectangle(TessBaseAPI* handle, int left, int top,
-                                      int width, int height);
+TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width,
+                                      int height);

-#ifdef TESS_CAPI_INCLUDE_BASEAPI
-TESS_API void TessBaseAPISetThresholder(TessBaseAPI* handle,
-                                        TessImageThresholder* thresholder);
-#endif
+TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
+TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa);
+TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa,
+                                              int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, BOOL raw_image, int raw_padding,
+                                               struct Pixa **pixa, int **blockids, int **paraids);
+TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa);
+TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc);
+TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
+                                                    TessPageIteratorLevel level, BOOL text_only,
+                                                    struct Pixa **pixa, int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle,
+                                                     TessPageIteratorLevel level, BOOL text_only,
+                                                     BOOL raw_image, int raw_padding,
+                                                     struct Pixa **pixa, int **blockids,
+                                                     int **paraids);

-TESS_API struct Pix* TessBaseAPIGetThresholdedImage(TessBaseAPI* handle);
-TESS_API struct Boxa* TessBaseAPIGetRegions(TessBaseAPI* handle,
-                                            struct Pixa** pixa);
-TESS_API struct Boxa* TessBaseAPIGetTextlines(TessBaseAPI* handle,
-                                              struct Pixa** pixa,
-                                              int** blockids);
-TESS_API struct Boxa* TessBaseAPIGetTextlines1(TessBaseAPI* handle,
-                                               BOOL raw_image, int raw_padding,
-                                               struct Pixa** pixa,
-                                               int** blockids, int** paraids);
-TESS_API struct Boxa* TessBaseAPIGetStrips(TessBaseAPI* handle,
-                                           struct Pixa** pixa, int** blockids);
-TESS_API struct Boxa* TessBaseAPIGetWords(TessBaseAPI* handle,
-                                          struct Pixa** pixa);
-TESS_API struct Boxa* TessBaseAPIGetConnectedComponents(TessBaseAPI* handle,
-                                                        struct Pixa** cc);
-TESS_API struct Boxa* TessBaseAPIGetComponentImages(TessBaseAPI* handle,
-                                                    TessPageIteratorLevel level,
-                                                    BOOL text_only,
-                                                    struct Pixa** pixa,
-                                                    int** blockids);
-TESS_API struct Boxa* TessBaseAPIGetComponentImages1(
-    TessBaseAPI* handle, TessPageIteratorLevel level, BOOL text_only,
-    BOOL raw_image, int raw_padding, struct Pixa** pixa, int** blockids,
-    int** paraids);
+TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle);

-TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
-    const TessBaseAPI* handle);
+TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);

-TESS_API TessPageIterator* TessBaseAPIAnalyseLayout(TessBaseAPI* handle);
+TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);

-TESS_API int TessBaseAPIRecognize(TessBaseAPI* handle, ETEXT_DESC* monitor);
+TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
+                                      const char *retry_config, int timeout_millisec,
+                                      TessResultRenderer *renderer);
+TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,
+                                     const char *filename, const char *retry_config,
+                                     int timeout_millisec, TessResultRenderer *renderer);
+
+TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
+TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle);
+
+TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
+TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
+
+TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
+
+TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number);
+
+TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
+TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
+
+TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);

 #ifndef DISABLED_LEGACY_ENGINE
-TESS_API int TessBaseAPIRecognizeForChopTest(TessBaseAPI* handle,
-                                             ETEXT_DESC* monitor);
-#endif
+TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode,
+                                        const char *wordstr);
+#endif // #ifndef DISABLED_LEGACY_ENGINE

-TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI* handle, const char* filename,
-                                      const char* retry_config,
-                                      int timeout_millisec,
-                                      TessResultRenderer* renderer);
-TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI* handle, struct Pix* pix,
-                                     int page_index, const char* filename,
-                                     const char* retry_config,
-                                     int timeout_millisec,
-                                     TessResultRenderer* renderer);
+TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
+TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);

-TESS_API TessResultIterator* TessBaseAPIGetIterator(TessBaseAPI* handle);
-TESS_API TessMutableIterator* TessBaseAPIGetMutableIterator(
-    TessBaseAPI* handle);
+TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
+TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope);

-TESS_API char* TessBaseAPIGetUTF8Text(TessBaseAPI* handle);
-TESS_API char* TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number);
+TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);

-TESS_API char* TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number);
-TESS_API char* TessBaseAPIGetTsvText(TessBaseAPI* handle, int page_number);
-
-TESS_API char* TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number);
-TESS_API char* TessBaseAPIGetLSTMBoxText(TessBaseAPI* handle, int page_number);
-TESS_API char* TessBaseAPIGetWordStrBoxText(TessBaseAPI* handle,
-                                            int page_number);
-
-TESS_API char* TessBaseAPIGetUNLVText(TessBaseAPI* handle);
-TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI* handle);
-
-TESS_API int* TessBaseAPIAllWordConfidences(TessBaseAPI* handle);
+TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);

 #ifndef DISABLED_LEGACY_ENGINE
-TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI* handle,
-                                        TessPageSegMode mode,
-                                        const char* wordstr);
-#endif  // ndef DISABLED_LEGACY_ENGINE
-
-TESS_API void TessBaseAPIClear(TessBaseAPI* handle);
-TESS_API void TessBaseAPIEnd(TessBaseAPI* handle);
-
-TESS_API int TessBaseAPIIsValidWord(TessBaseAPI* handle, const char* word);
-TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI* handle, int* out_offset,
-                                          float* out_slope);
-
-#ifdef TESS_CAPI_INCLUDE_BASEAPI
-
-TESS_API void TessBaseAPISetDictFunc(TessBaseAPI* handle, TessDictFunc f);
-
-TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI* handle);
-
-TESS_API void TessBaseAPISetProbabilityInContextFunc(
-    TessBaseAPI* handle, TessProbabilityInContextFunc f);

 // Call TessDeleteText(*script_name) to free memory allocated by this
 // function
-TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI* handle,
-                                                 int* orient_deg,
-                                                 float* orient_conf,
-                                                 const char** script_name,
-                                                 float* script_conf);
+TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg,
+                                                 float *orient_conf, const char **script_name,
+                                                 float *script_conf);
+#endif // #ifndef DISABLED_LEGACY_ENGINE

-#endif  // def TESS_CAPI_INCLUDE_BASEAPI
+TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin);

-TESS_API const char* TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id);
+TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);

-TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle,
-                                                 double margin);
+TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);

-#ifdef TESS_CAPI_INCLUDE_BASEAPI
-
-TESS_API const TessDawg* TessBaseAPIGetDawg(const TessBaseAPI* handle, int i);
-
-TESS_API int TessBaseAPINumDawgs(const TessBaseAPI* handle);
-
-TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI* handle);
-
-TESS_API void TessBaseAPIInitTruthCallback(TessBaseAPI* handle,
-                                           TessTruthCallback cb);
-
-TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI* handle,
-                                               int** block_orientation,
-                                               bool** vertical_writing);
-
-#endif
+TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,
+                                               bool **vertical_writing);

 /* Page iterator */

-TESS_API void TessPageIteratorDelete(TessPageIterator* handle);
+TESS_API void TessPageIteratorDelete(TessPageIterator *handle);

-TESS_API TessPageIterator* TessPageIteratorCopy(const TessPageIterator* handle);
+TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);

-TESS_API void TessPageIteratorBegin(TessPageIterator* handle);
+TESS_API void TessPageIteratorBegin(TessPageIterator *handle);

-TESS_API BOOL TessPageIteratorNext(TessPageIterator* handle,
-                                   TessPageIteratorLevel level);
+TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level);

-TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator* handle,
+TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
                                              TessPageIteratorLevel level);

-TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator* handle,
+TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
                                               TessPageIteratorLevel level,
                                               TessPageIteratorLevel element);

-TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator* handle,
-                                          TessPageIteratorLevel level,
-                                          int* left, int* top, int* right,
-                                          int* bottom);
+TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
+                                          TessPageIteratorLevel level, int *left, int *top,
+                                          int *right, int *bottom);

-TESS_API TessPolyBlockType
-TessPageIteratorBlockType(const TessPageIterator* handle);
+TESS_API TessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle);

-TESS_API struct Pix* TessPageIteratorGetBinaryImage(
-    const TessPageIterator* handle, TessPageIteratorLevel level);
+TESS_API struct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,
+                                                    TessPageIteratorLevel level);

-TESS_API struct Pix* TessPageIteratorGetImage(const TessPageIterator* handle,
-                                              TessPageIteratorLevel level,
-                                              int padding,
-                                              struct Pix* original_image,
-                                              int* left, int* top);
+TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
+                                              TessPageIteratorLevel level, int padding,
+                                              struct Pix *original_image, int *left, int *top);

-TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator* handle,
-                                       TessPageIteratorLevel level, int* x1,
-                                       int* y1, int* x2, int* y2);
+TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level,
+                                       int *x1, int *y1, int *x2, int *y2);

-TESS_API void TessPageIteratorOrientation(
-    TessPageIterator* handle, TessOrientation* orientation,
-    TessWritingDirection* writing_direction, TessTextlineOrder* textline_order,
-    float* deskew_angle);
+TESS_API void TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,
+                                          TessWritingDirection *writing_direction,
+                                          TessTextlineOrder *textline_order, float *deskew_angle);

-TESS_API void TessPageIteratorParagraphInfo(
-    TessPageIterator* handle, TessParagraphJustification* justification,
-    BOOL* is_list_item, BOOL* is_crown, int* first_line_indent);
+TESS_API void TessPageIteratorParagraphInfo(TessPageIterator *handle,
+                                            TessParagraphJustification *justification,
+                                            BOOL *is_list_item, BOOL *is_crown,
+                                            int *first_line_indent);

 /* Result iterator */

-TESS_API void TessResultIteratorDelete(TessResultIterator* handle);
-TESS_API TessResultIterator* TessResultIteratorCopy(
-    const TessResultIterator* handle);
-TESS_API TessPageIterator* TessResultIteratorGetPageIterator(
-    TessResultIterator* handle);
-TESS_API const TessPageIterator* TessResultIteratorGetPageIteratorConst(
-    const TessResultIterator* handle);
-TESS_API TessChoiceIterator* TessResultIteratorGetChoiceIterator(
-    const TessResultIterator* handle);
+TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
+TESS_API TessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle);
+TESS_API TessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle);
+TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
+    const TessResultIterator *handle);
+TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle);

-TESS_API BOOL TessResultIteratorNext(TessResultIterator* handle,
-                                     TessPageIteratorLevel level);
-TESS_API char* TessResultIteratorGetUTF8Text(const TessResultIterator* handle,
+TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level);
+TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
                                             TessPageIteratorLevel level);
-TESS_API float TessResultIteratorConfidence(const TessResultIterator* handle,
+TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
                                            TessPageIteratorLevel level);
-TESS_API const char* TessResultIteratorWordRecognitionLanguage(
-    const TessResultIterator* handle);
-TESS_API const char* TessResultIteratorWordFontAttributes(
-    const TessResultIterator* handle, BOOL* is_bold, BOOL* is_italic,
-    BOOL* is_underlined, BOOL* is_monospace, BOOL* is_serif, BOOL* is_smallcaps,
-    int* pointsize, int* font_id);
+TESS_API const char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle);
+TESS_API const char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle,
+                                                          BOOL *is_bold, BOOL *is_italic,
+                                                          BOOL *is_underlined, BOOL *is_monospace,
+                                                          BOOL *is_serif, BOOL *is_smallcaps,
+                                                          int *pointsize, int *font_id);

-TESS_API BOOL
-TessResultIteratorWordIsFromDictionary(const TessResultIterator* handle);
-TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator* handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSuperscript(const TessResultIterator* handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsSubscript(const TessResultIterator* handle);
-TESS_API BOOL
-TessResultIteratorSymbolIsDropcap(const TessResultIterator* handle);
+TESS_API BOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
+TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
+TESS_API BOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
+TESS_API BOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
+TESS_API BOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);

-TESS_API void TessChoiceIteratorDelete(TessChoiceIterator* handle);
-TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator* handle);
-TESS_API const char* TessChoiceIteratorGetUTF8Text(
-    const TessChoiceIterator* handle);
-TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator* handle);
+TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
+TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
+TESS_API const char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle);
+TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);

 /* Progress monitor */

-TESS_API ETEXT_DESC* TessMonitorCreate();
-TESS_API void TessMonitorDelete(ETEXT_DESC* monitor);
-TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC* monitor,
-                                       TessCancelFunc cancelFunc);
-TESS_API void TessMonitorSetCancelThis(ETEXT_DESC* monitor, void* cancelThis);
-TESS_API void* TessMonitorGetCancelThis(ETEXT_DESC* monitor);
-TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC* monitor,
-                                         TessProgressFunc progressFunc);
-TESS_API int TessMonitorGetProgress(ETEXT_DESC* monitor);
-TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC* monitor, int deadline);
-
-#ifndef DISABLED_LEGACY_ENGINE
-
-#  ifdef TESS_CAPI_INCLUDE_BASEAPI
-TESS_API void TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle,
-                                            TessFillLatticeFunc f);
-
-TESS_API void TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob,
-                                            INT_FEATURE_STRUCT* int_features,
-                                            int* num_features,
-                                            int* FeatureOutlineIndex);
-
-TESS_API ROW* TessFindRowForBox(BLOCK_LIST* blocks, int left, int top,
-                                int right, int bottom);
-
-TESS_API void TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob,
-                                               int num_max_matches,
-                                               int* unichar_ids, float* ratings,
-                                               int* num_matches_returned);
-
-TESS_API ROW* TessMakeTessOCRRow(float baseline, float xheight, float descender,
-                                 float ascender);
-
-TESS_API TBLOB* TessMakeTBLOB(Pix* pix);
-
-TESS_API void TessNormalizeTBLOB(TBLOB* tblob, ROW* row, BOOL numeric_mode);
-
-TESS_API BLOCK_LIST* TessBaseAPIFindLinesCreateBlockList(TessBaseAPI* handle);
-
-TESS_API void TessDeleteBlockList(BLOCK_LIST* block_list);
-
-#  endif  // def TESS_CAPI_INCLUDE_BASEAPI
-
-#endif  // ndef DISABLED_LEGACY_ENGINE
+TESS_API ETEXT_DESC *TessMonitorCreate();
+TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc);
+TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
+TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc);
+TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);

 #ifdef __cplusplus
 }
 #endif

-#endif  // API_CAPI_H_
+#endif // API_CAPI_H_
--- a/include/tesseract/platform.h
+++ b/include/tesseract/platform.h
@ -1,5 +1,5 @@
 ///////////////////////////////////////////////////////////////////////
-// File:        platform.h
+// File:        export.h
 // Description: Place holder
 //
 // (C) Copyright 2006, Google Inc.
@ -15,45 +15,25 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#ifndef TESSERACT_CCUTIL_PLATFORM_H_
-#define TESSERACT_CCUTIL_PLATFORM_H_
+#ifndef TESSERACT_PLATFORM_H_
+#define TESSERACT_PLATFORM_H_

-#define DLLSYM
-#ifndef _WIN32
-#  ifdef __cplusplus
-#    include <climits>
-#  else /* C compiler*/
-#    include <limits.h>
-#  endif /* __cplusplus */
-#  ifndef PATH_MAX
-#    define MAX_PATH 4096
-#  else
-#    define MAX_PATH PATH_MAX
-#  endif
-#endif
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-#  if defined(TESS_EXPORTS)
-#    define TESS_API __declspec(dllexport)
-#  elif defined(TESS_IMPORTS)
-#    define TESS_API __declspec(dllimport)
-#  else
-#    define TESS_API
-#  endif
-#  define TESS_LOCAL
-#else
-#  if __GNUC__ >= 4
-#    if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
-#      define TESS_API __attribute__((visibility("default")))
-#      define TESS_LOCAL __attribute__((visibility("hidden")))
+#ifndef TESS_API
+#  if defined(_WIN32) || defined(__CYGWIN__)
+#    if defined(TESS_EXPORTS)
+#      define TESS_API __declspec(dllexport)
+#    elif defined(TESS_IMPORTS)
+#      define TESS_API __declspec(dllimport)
 #    else
 #      define TESS_API
-#      define TESS_LOCAL
 #    endif
 #  else
-#    define TESS_API
-#    define TESS_LOCAL
+#    if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
+#      define TESS_API __attribute__((visibility("default")))
+#    else
+#      define TESS_API
+#    endif
 #  endif
 #endif

-#endif  // TESSERACT_CCUTIL_PLATFORM_H_
+#endif // TESSERACT_PLATFORM_H_
--- a/include/tesseract/ltrresultiterator.h
+++ b/include/tesseract/ltrresultiterator.h
@ -20,17 +20,17 @@
 #ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
 #define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_

-#include "pageiterator.h"  // for PageIterator
-#include "platform.h"      // for TESS_API
-#include "publictypes.h"   // for PageIteratorLevel
-#include "unichar.h"       // for StrongScriptDirection
+#include "export.h"       // for TESS_API
+#include "pageiterator.h" // for PageIterator
+#include "publictypes.h"  // for PageIteratorLevel
+#include "unichar.h"      // for StrongScriptDirection
+
+namespace tesseract {

 class BLOB_CHOICE_IT;
 class PAGE_RES;
 class WERD_RES;

-namespace tesseract {
-
 class Tesseract;

 // Class to iterate over tesseract results, providing access to all levels
@ -40,14 +40,14 @@ class Tesseract;
 // therefore can only be used while the TessBaseAPI class still exists and
 // has not been subjected to a call of Init, SetImage, Recognize, Clear, End
 // DetectOS, or anything else that changes the internal PAGE_RES.
-// See apitypes.h for the definition of PageIteratorLevel.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
 // See also base class PageIterator, which contains the bulk of the interface.
 // LTRResultIterator adds text-specific methods for access to OCR output.

 class TESS_API LTRResultIterator : public PageIterator {
  friend class ChoiceIterator;

- public:
+public:
  // page_res and tesseract come directly from the BaseAPI.
  // The rectangle parameters are copied indirectly from the Thresholder,
  // via the BaseAPI. They represent the coordinates of some rectangle in an
@ -60,9 +60,8 @@ class TESS_API LTRResultIterator : public PageIterator {
  // The scaled_yres indicates the effective resolution of the binary image
  // that tesseract has been given by the Thresholder.
  // After the constructor, Begin has already been called.
-  LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
-                    int scaled_yres, int rect_left, int rect_top,
-                    int rect_width, int rect_height);
+  LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
+                    int rect_left, int rect_top, int rect_width, int rect_height);

  ~LTRResultIterator() override;

@ -81,21 +80,20 @@ class TESS_API LTRResultIterator : public PageIterator {

  // Returns the null terminated UTF-8 encoded text string for the current
  // object at the given level. Use delete [] to free after use.
-  char* GetUTF8Text(PageIteratorLevel level) const;
+  char *GetUTF8Text(PageIteratorLevel level) const;

  // Set the string inserted at the end of each text line. "\n" by default.
-  void SetLineSeparator(const char* new_line);
+  void SetLineSeparator(const char *new_line);

  // Set the string inserted at the end of each paragraph. "\n" by default.
-  void SetParagraphSeparator(const char* new_para);
+  void SetParagraphSeparator(const char *new_para);

  // Returns the mean confidence of the current object at the given level.
  // The number should be interpreted as a percent probability. (0.0f-100.0f)
  float Confidence(PageIteratorLevel level) const;

  // Returns the attributes of the current row.
-  void RowAttributes(float* row_height, float* descenders,
-                     float* ascenders) const;
+  void RowAttributes(float *row_height, float *descenders, float *ascenders) const;

  // ============= Functions that refer to words only ============.

@ -107,14 +105,13 @@ class TESS_API LTRResultIterator : public PageIterator {
  // the iterator itself, ie rendered invalid by various members of
  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
  // Pointsize is returned in printers points (1/72 inch.)
-  const char* WordFontAttributes(bool* is_bold, bool* is_italic,
-                                 bool* is_underlined, bool* is_monospace,
-                                 bool* is_serif, bool* is_smallcaps,
-                                 int* pointsize, int* font_id) const;
+  const char *WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined,
+                                 bool *is_monospace, bool *is_serif, bool *is_smallcaps,
+                                 int *pointsize, int *font_id) const;

  // Return the name of the language used to recognize this word.
  // On error, nullptr.  Do not delete this pointer.
-  const char* WordRecognitionLanguage() const;
+  const char *WordRecognitionLanguage() const;

  // Return the overall directionality of this word.
  StrongScriptDirection WordDirection() const;
@ -133,34 +130,34 @@ class TESS_API LTRResultIterator : public PageIterator {

  // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
  // of the current word.
-  const void* GetParamsTrainingBundle() const;
+  const void *GetParamsTrainingBundle() const;

  // Returns a pointer to the string with blamer information for this word.
  // Assumes that the word's blamer_bundle is not nullptr.
-  const char* GetBlamerDebug() const;
+  const char *GetBlamerDebug() const;

  // Returns a pointer to the string with misadaption information for this word.
  // Assumes that the word's blamer_bundle is not nullptr.
-  const char* GetBlamerMisadaptionDebug() const;
+  const char *GetBlamerMisadaptionDebug() const;

  // Returns true if a truth string was recorded for the current word.
  bool HasTruthString() const;

  // Returns true if the given string is equivalent to the truth string for
  // the current word.
-  bool EquivalentToTruth(const char* str) const;
+  bool EquivalentToTruth(const char *str) const;

  // Returns a null terminated UTF-8 encoded truth string for the current word.
  // Use delete [] to free after use.
-  char* WordTruthUTF8Text() const;
+  char *WordTruthUTF8Text() const;

  // Returns a null terminated UTF-8 encoded normalized OCR string for the
  // current word. Use delete [] to free after use.
-  char* WordNormedUTF8Text() const;
+  char *WordNormedUTF8Text() const;

  // Returns a pointer to serialized choice lattice.
  // Fills lattice_size with the number of bytes in lattice data.
-  const char* WordLattice(int* lattice_size) const;
+  const char *WordLattice(int *lattice_size) const;

  // ============= Functions that refer to symbols only ============.

@ -177,18 +174,18 @@ class TESS_API LTRResultIterator : public PageIterator {
  // this will return the attributes of the first symbol in that word.
  bool SymbolIsDropcap() const;

- protected:
-  const char* line_separator_;
-  const char* paragraph_separator_;
+protected:
+  const char *line_separator_;
+  const char *paragraph_separator_;
 };

 // Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class ChoiceIterator {
- public:
+class TESS_API ChoiceIterator {
+public:
  // Construction is from a LTRResultIterator that points to the symbol of
  // interest. The ChoiceIterator allows a one-shot iteration over the
  // choices for this symbol and after that it is useless.
-  explicit ChoiceIterator(const LTRResultIterator& result_it);
+  explicit ChoiceIterator(const LTRResultIterator &result_it);
  ~ChoiceIterator();

  // Moves to the next choice for the symbol and returns false if there
@ -201,7 +198,7 @@ class ChoiceIterator {
  // choice.
  // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
  // internal structure and should NOT be delete[]ed to free after use.
-  const char* GetUTF8Text() const;
+  const char *GetUTF8Text() const;

  // Returns the confidence of the current choice depending on the used language
  // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
@ -215,19 +212,19 @@ class ChoiceIterator {
  // selected symbol. A timestep is a vector containing pairs of symbols and
  // floating point numbers. The number states the probability for the
  // corresponding symbol.
-  std::vector<std::vector<std::pair<const char*, float>>>* Timesteps() const;
+  std::vector<std::vector<std::pair<const char *, float>>> *Timesteps() const;

- private:
+private:
  // clears the remaining spaces out of the results and adapts the probabilities
  void filterSpaces();
  // Pointer to the WERD_RES object owned by the API.
-  WERD_RES* word_res_;
+  WERD_RES *word_res_;
  // Iterator over the blob choices.
-  BLOB_CHOICE_IT* choice_it_;
-  std::vector<std::pair<const char*, float>>* LSTM_choices_ = nullptr;
-  std::vector<std::pair<const char*, float>>::iterator LSTM_choice_it_;
+  BLOB_CHOICE_IT *choice_it_;
+  std::vector<std::pair<const char *, float>> *LSTM_choices_ = nullptr;
+  std::vector<std::pair<const char *, float>>::iterator LSTM_choice_it_;

-  const int* tstep_index_;
+  const int *tstep_index_;
  // regulates the rating granularity
  double rating_coefficient_;
  // leading blanks
@ -236,6 +233,6 @@ class ChoiceIterator {
  bool oemLSTM_;
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
+#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
--- a/include/tesseract/ocrclass.h
+++ b/include/tesseract/ocrclass.h
@ -29,6 +29,8 @@
 #include <chrono>
 #include <ctime>

+namespace tesseract {
+
 /**********************************************************************
 * EANYCODE_CHAR
 * Description of a single character. The character code is defined by
@ -53,7 +55,7 @@
 * version.
 **********************************************************************/

-typedef struct { /*single character */
+struct EANYCODE_CHAR { /*single character */
  // It should be noted that the format for char_code for version 2.0 and beyond
  // is UTF8 which means that ASCII characters will come out as one structure
  // but other characters will be returned in two or more instances of this
@ -72,7 +74,7 @@ typedef struct { /*single character */
  uint8_t point_size; /*of char, 72=1 inch, (10) */
  int8_t blanks;      /*no of spaces before this char (1) */
  uint8_t formatting; /*char formatting (0) */
-} EANYCODE_CHAR;      /*single character */
+};

 /**********************************************************************
 * ETEXT_DESC
@ -92,62 +94,58 @@ typedef struct { /*single character */
 **********************************************************************/
 class ETEXT_DESC;

-using CANCEL_FUNC = bool (*)(void*, int);
+using CANCEL_FUNC = bool (*)(void *, int);
 using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
-using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC*, int, int, int, int);
+using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);

-class ETEXT_DESC {  // output header
- public:
-  int16_t count{0};     /// chars in this buffer(0)
-  int16_t progress{0};  /// percent complete increasing (0-100)
+class ETEXT_DESC { // output header
+public:
+  int16_t count{0};    /// chars in this buffer(0)
+  int16_t progress{0}; /// percent complete increasing (0-100)
  /** Progress monitor covers word recognition and it does not cover layout
   * analysis.
   * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
-  int8_t more_to_come{0};        /// true if not last
-  volatile int8_t ocr_alive{0};  /// ocr sets to 1, HP 0
-  int8_t err_code{0};            /// for errcode use
-  CANCEL_FUNC cancel{nullptr};   /// returns true to cancel
-  PROGRESS_FUNC progress_callback{
-      nullptr};                       /// called whenever progress increases
-  PROGRESS_FUNC2 progress_callback2;  /// monitor-aware progress callback
-  void* cancel_this{nullptr};         /// this or other data for cancel
+  int8_t more_to_come{0};                   /// true if not last
+  volatile int8_t ocr_alive{0};             /// ocr sets to 1, HP 0
+  int8_t err_code{0};                       /// for errcode use
+  CANCEL_FUNC cancel{nullptr};              /// returns true to cancel
+  PROGRESS_FUNC progress_callback{nullptr}; /// called whenever progress increases
+  PROGRESS_FUNC2 progress_callback2;        /// monitor-aware progress callback
+  void *cancel_this{nullptr};               /// this or other data for cancel
  std::chrono::steady_clock::time_point end_time;
  /// Time to stop. Expected to be set only
  /// by call to set_deadline_msecs().
-  EANYCODE_CHAR text[1]{};  /// character data
+  EANYCODE_CHAR text[1]{}; /// character data

  ETEXT_DESC() : progress_callback2(&default_progress_func) {
-    end_time = std::chrono::time_point<std::chrono::steady_clock,
-                                       std::chrono::milliseconds>();
+    end_time = std::chrono::time_point<std::chrono::steady_clock, std::chrono::milliseconds>();
  }

  // Sets the end time to be deadline_msecs milliseconds from now.
  void set_deadline_msecs(int32_t deadline_msecs) {
    if (deadline_msecs > 0) {
-      end_time = std::chrono::steady_clock::now() +
-                 std::chrono::milliseconds(deadline_msecs);
+      end_time = std::chrono::steady_clock::now() + std::chrono::milliseconds(deadline_msecs);
    }
  }

  // Returns false if we've not passed the end_time, or have not set a deadline.
  bool deadline_exceeded() const {
-    if (end_time.time_since_epoch() ==
-        std::chrono::steady_clock::duration::zero()) {
+    if (end_time.time_since_epoch() == std::chrono::steady_clock::duration::zero()) {
      return false;
    }
    auto now = std::chrono::steady_clock::now();
    return (now > end_time);
  }

- private:
-  static bool default_progress_func(ETEXT_DESC* ths, int left, int right,
-                                    int top, int bottom) {
+private:
+  static bool default_progress_func(ETEXT_DESC *ths, int left, int right, int top, int bottom) {
    if (ths->progress_callback != nullptr) {
-      return (*(ths->progress_callback))(ths->progress, left, right, top,
-                                         bottom);
+      return (*(ths->progress_callback))(ths->progress, left, right, top, bottom);
    }
    return true;
  }
 };

-#endif  // CCUTIL_OCRCLASS_H_
+} // namespace tesseract
+
+#endif // CCUTIL_OCRCLASS_H_
--- a/include/tesseract/osdetect.h
+++ b/include/tesseract/osdetect.h
@ -20,27 +20,25 @@
 #ifndef TESSERACT_CCMAIN_OSDETECT_H_
 #define TESSERACT_CCMAIN_OSDETECT_H_

-#include "platform.h"  // for TESS_API
+#include "export.h" // for TESS_API
+
+#include <vector> // for std::vector
+
+namespace tesseract {

 class BLOBNBOX;
 class BLOBNBOX_CLIST;
 class BLOB_CHOICE_LIST;
-class STRING;
 class TO_BLOCK_LIST;
 class UNICHARSET;
-template <typename T>
-class GenericVector;

-namespace tesseract {
 class Tesseract;
-}

 // Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
 const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;

 struct OSBestResult {
-  OSBestResult()
-      : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
+  OSBestResult() : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
  int orientation_id;
  int script_id;
  float sconfidence;
@ -50,7 +48,8 @@ struct OSBestResult {
 struct OSResults {
  OSResults() : unicharset(nullptr) {
    for (int i = 0; i < 4; ++i) {
-      for (int j = 0; j < kMaxNumberOfScripts; ++j) scripts_na[i][j] = 0;
+      for (int j = 0; j < kMaxNumberOfScripts; ++j)
+        scripts_na[i][j] = 0;
      orientations[i] = 0;
    }
  }
@ -63,7 +62,7 @@ struct OSResults {
  // Return the index of the script with the highest score for this orientation.
  TESS_API int get_best_script(int orientation_id) const;
  // Accumulate scores with given OSResults instance and update the best script.
-  void accumulate(const OSResults& osr);
+  void accumulate(const OSResults &osr);

  // Print statistics.
  void print_scores(void) const;
@ -77,34 +76,33 @@ struct OSResults {
  // Script confidence scores for each of 4 possible orientations.
  float scripts_na[4][kMaxNumberOfScripts];

-  UNICHARSET* unicharset;
+  UNICHARSET *unicharset;
  OSBestResult best_result;
 };

 class OrientationDetector {
- public:
-  OrientationDetector(const GenericVector<int>* allowed_scripts,
-                      OSResults* results);
-  bool detect_blob(BLOB_CHOICE_LIST* scores);
+public:
+  OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *results);
+  bool detect_blob(BLOB_CHOICE_LIST *scores);
  int get_orientation();

- private:
-  OSResults* osr_;
-  const GenericVector<int>* allowed_scripts_;
+private:
+  OSResults *osr_;
+  const std::vector<int> *allowed_scripts_;
 };

 class ScriptDetector {
- public:
-  ScriptDetector(const GenericVector<int>* allowed_scripts, OSResults* osr,
-                 tesseract::Tesseract* tess);
-  void detect_blob(BLOB_CHOICE_LIST* scores);
+public:
+  ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
+                 tesseract::Tesseract *tess);
+  void detect_blob(BLOB_CHOICE_LIST *scores);
  bool must_stop(int orientation);

- private:
-  OSResults* osr_;
-  static const char* korean_script_;
-  static const char* japanese_script_;
-  static const char* fraktur_script_;
+private:
+  OSResults *osr_;
+  static const char *korean_script_;
+  static const char *japanese_script_;
+  static const char *fraktur_script_;
  int korean_id_;
  int japanese_id_;
  int katakana_id_;
@ -113,26 +111,25 @@ class ScriptDetector {
  int hangul_id_;
  int latin_id_;
  int fraktur_id_;
-  tesseract::Tesseract* tess_;
-  const GenericVector<int>* allowed_scripts_;
+  tesseract::Tesseract *tess_;
+  const std::vector<int> *allowed_scripts_;
 };

-int orientation_and_script_detection(STRING& filename, OSResults*,
-                                     tesseract::Tesseract*);
+int orientation_and_script_detection(const char *filename, OSResults *, tesseract::Tesseract *);

-int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
-              tesseract::Tesseract* tess);
+int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess);

-int os_detect_blobs(const GenericVector<int>* allowed_scripts,
-                    BLOBNBOX_CLIST* blob_list, OSResults* osr,
-                    tesseract::Tesseract* tess);
+int os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,
+                    OSResults *osr, tesseract::Tesseract *tess);

-bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o, ScriptDetector* s,
-                    OSResults*, tesseract::Tesseract* tess);
+bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *,
+                    tesseract::Tesseract *tess);

 // Helper method to convert an orientation index to its value in degrees.
 // The value represents the amount of clockwise rotation in degrees that must be
 // applied for the text to be upright (readable).
-TESS_API int OrientationIdToValue(const int& id);
+TESS_API int OrientationIdToValue(const int &id);

-#endif  // TESSERACT_CCMAIN_OSDETECT_H_
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_OSDETECT_H_
--- a/include/tesseract/pageiterator.h
+++ b/include/tesseract/pageiterator.h
@ -3,7 +3,6 @@
 // Description: Iterator for tesseract page structure that avoids using
 //              tesseract internal data structures.
 // Author:      Ray Smith
-// Created:     Fri Feb 26 11:01:06 PST 2010
 //
 // (C) Copyright 2010, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -21,18 +20,19 @@
 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
 #define TESSERACT_CCMAIN_PAGEITERATOR_H_

-#include "platform.h"
+#include "export.h"
 #include "publictypes.h"

+struct Pix;
+struct Pta;
+
+namespace tesseract {
+
 struct BlamerBundle;
 class C_BLOB_IT;
 class PAGE_RES;
 class PAGE_RES_IT;
 class WERD;
-struct Pix;
-struct Pta;
-
-namespace tesseract {

 class Tesseract;

@ -44,13 +44,13 @@ class Tesseract;
 * therefore can only be used while the TessBaseAPI class still exists and
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
 * DetectOS, or anything else that changes the internal PAGE_RES.
- * See apitypes.h for the definition of PageIteratorLevel.
+ * See tesseract/publictypes.h for the definition of PageIteratorLevel.
 * See also ResultIterator, derived from PageIterator, which adds in the
 * ability to access OCR output with text-specific methods.
 */

 class TESS_API PageIterator {
- public:
+public:
  /**
   * page_res and tesseract come directly from the BaseAPI.
   * The rectangle parameters are copied indirectly from the Thresholder,
@ -65,9 +65,8 @@ class TESS_API PageIterator {
   * that tesseract has been given by the Thresholder.
   * After the constructor, Begin has already been called.
   */
-  PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
-               int scaled_yres, int rect_left, int rect_top, int rect_width,
-               int rect_height);
+  PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left,
+               int rect_top, int rect_width, int rect_height);
  virtual ~PageIterator();

  /**
@ -76,11 +75,11 @@ class TESS_API PageIterator {
   * objects at a higher level. These constructors DO NOT CALL Begin, so
   * iterations will continue from the location of src.
   */
-  PageIterator(const PageIterator& src);
-  const PageIterator& operator=(const PageIterator& src);
+  PageIterator(const PageIterator &src);
+  const PageIterator &operator=(const PageIterator &src);

  /** Are we positioned at the same location as other? */
-  bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
+  bool PositionedAtSameWord(const PAGE_RES_IT *other) const;

  // ============= Moving around within the page ============.

@ -154,8 +153,7 @@ class TESS_API PageIterator {
   *  it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
   *  it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
   */
-  virtual bool IsAtFinalElement(PageIteratorLevel level,
-                                PageIteratorLevel element) const;
+  virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const;

  /**
   * Returns whether this iterator is positioned
@ -163,7 +161,7 @@ class TESS_API PageIterator {
   *   equal to other:  0
   *   after other:     1
   */
-  int Cmp(const PageIterator& other) const;
+  int Cmp(const PageIterator &other) const;

  // ============= Accessing data ==============.
  // Coordinate system:
@ -187,8 +185,7 @@ class TESS_API PageIterator {
   * where the placement is obvious, and after recognition, it doesn't make as
   * much difference, as the diacritics will already be included in the word.
   */
-  void SetBoundingBoxComponents(bool include_upper_dots,
-                                bool include_lower_dots) {
+  void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots) {
    include_upper_dots_ = include_upper_dots;
    include_lower_dots_ = include_lower_dots;
  }
@ -202,24 +199,23 @@ class TESS_API PageIterator {
   * from a grey image. The padding argument to GetImage can be used to expand
   * the image to include more foreground pixels. See GetImage below.
   */
-  bool BoundingBox(PageIteratorLevel level, int* left, int* top, int* right,
-                   int* bottom) const;
-  bool BoundingBox(PageIteratorLevel level, int padding, int* left, int* top,
-                   int* right, int* bottom) const;
+  bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const;
+  bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, int *right,
+                   int *bottom) const;
  /**
   * Returns the bounding rectangle of the object in a coordinate system of the
   * working image rectangle having its origin at (rect_left_, rect_top_) with
   * respect to the original image and is scaled by a factor scale_.
   */
-  bool BoundingBoxInternal(PageIteratorLevel level, int* left, int* top,
-                           int* right, int* bottom) const;
+  bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right,
+                           int *bottom) const;

  /** Returns whether there is no object of a given level. */
  bool Empty(PageIteratorLevel level) const;

  /**
-   * Returns the type of the current block. See apitypes.h for
-   * PolyBlockType.
+   * Returns the type of the current block.
+   * See tesseract/publictypes.h for PolyBlockType.
   */
  PolyBlockType BlockType() const;

@ -230,7 +226,7 @@ class TESS_API PageIterator {
   * point and the first point. nullptr will be returned if the iterator is
   * at the end of the document or layout analysis was not used.
   */
-  Pta* BlockPolygon() const;
+  Pta *BlockPolygon() const;

  /**
   * Returns a binary image of the current object at the given level.
@ -238,7 +234,7 @@ class TESS_API PageIterator {
   * this could be upscaled with respect to the original input image.
   * Use pixDestroy to delete the image after use.
   */
-  Pix* GetBinaryImage(PageIteratorLevel level) const;
+  Pix *GetBinaryImage(PageIteratorLevel level) const;

  /**
   * Returns an image of the current object at the given level in greyscale
@ -251,8 +247,7 @@ class TESS_API PageIterator {
   * If you do not supply an original image, you will get a binary one.
   * Use pixDestroy to delete the image after use.
   */
-  Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
-                int* left, int* top) const;
+  Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const;

  /**
   * Returns the baseline of the current object at the given level.
@ -260,8 +255,7 @@ class TESS_API PageIterator {
   * WARNING: with vertical text, baselines may be vertical!
   * Returns false if there is no baseline at the current position.
   */
-  bool Baseline(PageIteratorLevel level, int* x1, int* y1, int* x2,
-                int* y2) const;
+  bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const;

  /**
   * Returns orientation for the block the iterator points to.
@ -271,10 +265,9 @@ class TESS_API PageIterator {
   *                 block anti-clockwise for it to be level?
   *                   -Pi/4 <= deskew_angle <= Pi/4
   */
-  void Orientation(tesseract::Orientation* orientation,
-                   tesseract::WritingDirection* writing_direction,
-                   tesseract::TextlineOrder* textline_order,
-                   float* deskew_angle) const;
+  void Orientation(tesseract::Orientation *orientation,
+                   tesseract::WritingDirection *writing_direction,
+                   tesseract::TextlineOrder *textline_order, float *deskew_angle) const;

  /**
   * Returns information about the current paragraph, if available.
@ -304,37 +297,36 @@ class TESS_API PageIterator {
   *             first_line_indent for subsequent paragraphs in this block
   *             of text.
   */
-  void ParagraphInfo(tesseract::ParagraphJustification* justification,
-                     bool* is_list_item, bool* is_crown,
-                     int* first_line_indent) const;
+  void ParagraphInfo(tesseract::ParagraphJustification *justification, bool *is_list_item,
+                     bool *is_crown, int *first_line_indent) const;

  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
  // of the current word to the given pointer (takes ownership of the pointer)
  // and returns true.
  // Can only be used when iterating on the word level.
-  bool SetWordBlamerBundle(BlamerBundle* blamer_bundle);
+  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);

- protected:
+protected:
  /**
   * Sets up the internal data for iterating the blobs of a new word, then
   * moves the iterator to the given offset.
   */
-  TESS_LOCAL void BeginWord(int offset);
+  void BeginWord(int offset);

  /** Pointer to the page_res owned by the API. */
-  PAGE_RES* page_res_;
+  PAGE_RES *page_res_;
  /** Pointer to the Tesseract object owned by the API. */
-  Tesseract* tesseract_;
+  Tesseract *tesseract_;
  /**
   * The iterator to the page_res_. Owned by this PageIterator.
   * A pointer just to avoid dragging in Tesseract includes.
   */
-  PAGE_RES_IT* it_;
+  PAGE_RES_IT *it_;
  /**
   * The current input WERD being iterated. If there is an output from OCR,
   * then word_ is nullptr. Owned by the API
   */
-  WERD* word_;
+  WERD *word_;
  /** The length of the current word_. */
  int word_length_;
  /** The current blob index within the word. */
@ -344,7 +336,7 @@ class TESS_API PageIterator {
   * OCR results in the box_word.
   * Owned by this PageIterator.
   */
-  C_BLOB_IT* cblob_it_;
+  C_BLOB_IT *cblob_it_;
  /** Control over what to include in bounding boxes. */
  bool include_upper_dots_;
  bool include_lower_dots_;
@ -357,6 +349,6 @@ class TESS_API PageIterator {
  int rect_height_;
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCMAIN_PAGEITERATOR_H_
+#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
--- a/include/tesseract/publictypes.h
+++ b/include/tesseract/publictypes.h
@ -19,6 +19,8 @@
 #ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
 #define TESSERACT_CCSTRUCT_PUBLICTYPES_H_

+namespace tesseract {
+
 // This file contains types that are used both by the API and internally
 // to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
 // dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
@ -26,7 +28,6 @@
 // but not for the low-level tesseract code to include top-level API code.
 // This file should not use other Tesseract types, as that would drag
 // their includes into the API-level.
-// API-level code should include apitypes.h in preference to this file.

 /** Number of printers' points in an inch. The unit of the pointsize return. */
 constexpr int kPointsPerInch = 72;
@ -50,21 +51,21 @@ constexpr int kResolutionEstimationFactor = 10;
 * Used extensively by ColPartition, and POLY_BLOCK.
 */
 enum PolyBlockType {
-  PT_UNKNOWN,          // Type is not yet known. Keep as the first element.
-  PT_FLOWING_TEXT,     // Text that lives inside a column.
-  PT_HEADING_TEXT,     // Text that spans more than one column.
-  PT_PULLOUT_TEXT,     // Text that is in a cross-column pull-out region.
-  PT_EQUATION,         // Partition belonging to an equation region.
-  PT_INLINE_EQUATION,  // Partition has inline equation.
-  PT_TABLE,            // Partition belonging to a table region.
-  PT_VERTICAL_TEXT,    // Text-line runs vertically.
-  PT_CAPTION_TEXT,     // Text that belongs to an image.
-  PT_FLOWING_IMAGE,    // Image that lives inside a column.
-  PT_HEADING_IMAGE,    // Image that spans more than one column.
-  PT_PULLOUT_IMAGE,    // Image that is in a cross-column pull-out region.
-  PT_HORZ_LINE,        // Horizontal Line.
-  PT_VERT_LINE,        // Vertical Line.
-  PT_NOISE,            // Lies outside of any column.
+  PT_UNKNOWN,         // Type is not yet known. Keep as the first element.
+  PT_FLOWING_TEXT,    // Text that lives inside a column.
+  PT_HEADING_TEXT,    // Text that spans more than one column.
+  PT_PULLOUT_TEXT,    // Text that is in a cross-column pull-out region.
+  PT_EQUATION,        // Partition belonging to an equation region.
+  PT_INLINE_EQUATION, // Partition has inline equation.
+  PT_TABLE,           // Partition belonging to a table region.
+  PT_VERTICAL_TEXT,   // Text-line runs vertically.
+  PT_CAPTION_TEXT,    // Text that belongs to an image.
+  PT_FLOWING_IMAGE,   // Image that lives inside a column.
+  PT_HEADING_IMAGE,   // Image that spans more than one column.
+  PT_PULLOUT_IMAGE,   // Image that is in a cross-column pull-out region.
+  PT_HORZ_LINE,       // Horizontal Line.
+  PT_VERT_LINE,       // Vertical Line.
+  PT_NOISE,           // Lies outside of any column.
  PT_COUNT
 };

@ -74,14 +75,12 @@ inline bool PTIsLineType(PolyBlockType type) {
 }
 /** Returns true if PolyBlockType is of image type */
 inline bool PTIsImageType(PolyBlockType type) {
-  return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
-         type == PT_PULLOUT_IMAGE;
+  return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE || type == PT_PULLOUT_IMAGE;
 }
 /** Returns true if PolyBlockType is of text type */
 inline bool PTIsTextType(PolyBlockType type) {
-  return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
-         type == PT_PULLOUT_TEXT || type == PT_TABLE ||
-         type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
+  return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT || type == PT_PULLOUT_TEXT ||
+         type == PT_TABLE || type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
         type == PT_INLINE_EQUATION;
 }
 // Returns true if PolyBlockType is of pullout(inter-column) type
@ -89,7 +88,6 @@ inline bool PTIsPulloutType(PolyBlockType type) {
  return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
 }

-namespace tesseract {
 /**
 *  +------------------+  Orientation Example:
 *  | 1 Aaaa Aaaa Aaaa |  ====================
@ -157,26 +155,25 @@ enum TextlineOrder {
 * so that the inequality test macros below work.
 */
 enum PageSegMode {
-  PSM_OSD_ONLY = 0,       ///< Orientation and script detection only.
-  PSM_AUTO_OSD = 1,       ///< Automatic page segmentation with orientation and
-                          ///< script detection. (OSD)
-  PSM_AUTO_ONLY = 2,      ///< Automatic page segmentation, but no OSD, or OCR.
-  PSM_AUTO = 3,           ///< Fully automatic page segmentation, but no OSD.
-  PSM_SINGLE_COLUMN = 4,  ///< Assume a single column of text of variable sizes.
-  PSM_SINGLE_BLOCK_VERT_TEXT = 5,  ///< Assume a single uniform block of
-                                   ///< vertically aligned text.
-  PSM_SINGLE_BLOCK = 6,  ///< Assume a single uniform block of text. (Default.)
-  PSM_SINGLE_LINE = 7,   ///< Treat the image as a single text line.
-  PSM_SINGLE_WORD = 8,   ///< Treat the image as a single word.
-  PSM_CIRCLE_WORD = 9,   ///< Treat the image as a single word in a circle.
-  PSM_SINGLE_CHAR = 10,  ///< Treat the image as a single character.
-  PSM_SPARSE_TEXT =
-      11,  ///< Find as much text as possible in no particular order.
-  PSM_SPARSE_TEXT_OSD = 12,  ///< Sparse text with orientation and script det.
-  PSM_RAW_LINE = 13,  ///< Treat the image as a single text line, bypassing
-                      ///< hacks that are Tesseract-specific.
+  PSM_OSD_ONLY = 0,               ///< Orientation and script detection only.
+  PSM_AUTO_OSD = 1,               ///< Automatic page segmentation with orientation and
+                                  ///< script detection. (OSD)
+  PSM_AUTO_ONLY = 2,              ///< Automatic page segmentation, but no OSD, or OCR.
+  PSM_AUTO = 3,                   ///< Fully automatic page segmentation, but no OSD.
+  PSM_SINGLE_COLUMN = 4,          ///< Assume a single column of text of variable sizes.
+  PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
+                                  ///< vertically aligned text.
+  PSM_SINGLE_BLOCK = 6,           ///< Assume a single uniform block of text. (Default.)
+  PSM_SINGLE_LINE = 7,            ///< Treat the image as a single text line.
+  PSM_SINGLE_WORD = 8,            ///< Treat the image as a single word.
+  PSM_CIRCLE_WORD = 9,            ///< Treat the image as a single word in a circle.
+  PSM_SINGLE_CHAR = 10,           ///< Treat the image as a single character.
+  PSM_SPARSE_TEXT = 11,           ///< Find as much text as possible in no particular order.
+  PSM_SPARSE_TEXT_OSD = 12,       ///< Sparse text with orientation and script det.
+  PSM_RAW_LINE = 13,              ///< Treat the image as a single text line, bypassing
+                                  ///< hacks that are Tesseract-specific.

-  PSM_COUNT  ///< Number of enum entries.
+  PSM_COUNT ///< Number of enum entries.
 };

 /**
@ -214,11 +211,11 @@ inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
 * have 5x as many functions.
 */
 enum PageIteratorLevel {
-  RIL_BLOCK,     // Block of text/image/separator line.
-  RIL_PARA,      // Paragraph within a block.
-  RIL_TEXTLINE,  // Line within a paragraph.
-  RIL_WORD,      // Word within a textline.
-  RIL_SYMBOL     // Symbol/character within a word.
+  RIL_BLOCK,    // Block of text/image/separator line.
+  RIL_PARA,     // Paragraph within a block.
+  RIL_TEXTLINE, // Line within a paragraph.
+  RIL_WORD,     // Word within a textline.
+  RIL_SYMBOL    // Symbol/character within a word.
 };

 /**
@ -263,21 +260,21 @@ enum ParagraphJustification {
 * mention the connection to OcrEngineMode in the comments.
 */
 enum OcrEngineMode {
-  OEM_TESSERACT_ONLY,           // Run Tesseract only - fastest; deprecated
-  OEM_LSTM_ONLY,                // Run just the LSTM line recognizer.
-  OEM_TESSERACT_LSTM_COMBINED,  // Run the LSTM recognizer, but allow fallback
-                                // to Tesseract when things get difficult.
-                                // deprecated
-  OEM_DEFAULT,                  // Specify this mode when calling init_*(),
-                                // to indicate that any of the above modes
-                                // should be automatically inferred from the
-                                // variables in the language-specific config,
-                                // command-line configs, or if not specified
-                                // in any of the above should be set to the
-                                // default OEM_TESSERACT_ONLY.
-  OEM_COUNT                     // Number of OEMs
+  OEM_TESSERACT_ONLY,          // Run Tesseract only - fastest; deprecated
+  OEM_LSTM_ONLY,               // Run just the LSTM line recognizer.
+  OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
+                               // to Tesseract when things get difficult.
+                               // deprecated
+  OEM_DEFAULT,                 // Specify this mode when calling init_*(),
+                               // to indicate that any of the above modes
+                               // should be automatically inferred from the
+                               // variables in the language-specific config,
+                               // command-line configs, or if not specified
+                               // in any of the above should be set to the
+                               // default OEM_TESSERACT_ONLY.
+  OEM_COUNT                    // Number of OEMs
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
+#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
--- a/include/tesseract/renderer.h
+++ b/include/tesseract/renderer.h
@ -18,14 +18,13 @@
 #ifndef TESSERACT_API_RENDERER_H_
 #define TESSERACT_API_RENDERER_H_

+#include "export.h"
+
 // To avoid collision with other typenames include the ABSOLUTE MINIMUM
 // complexity of includes here. Use forward declarations wherever possible
 // and hide includes of complex types in baseapi.cpp.
-#include <string>  // for std::string
-
-#include "genericvector.h"
-#include "platform.h"
-#include "strngs.h"  // for STRING
+#include <string> // for std::string
+#include <vector> // for std::vector

 struct Pix;

@ -47,16 +46,16 @@ class TessBaseAPI;
 * in addition to the heuristics for producing it.
 */
 class TESS_API TessResultRenderer {
- public:
+public:
  virtual ~TessResultRenderer();

  // Takes ownership of pointer so must be new'd instance.
   // Renderers aren't ordered, but this appends the sequences of the next parameter
   // and the existing next(). The renderers should be unique across both lists.
-  void insert(TessResultRenderer* next);
+  void insert(TessResultRenderer *next);

  // Returns the next renderer or nullptr.
-  TessResultRenderer* next() {
+  TessResultRenderer *next() {
    return next_;
  }

@ -65,7 +64,7 @@ class TESS_API TessResultRenderer {
   * This clears the contents of the output data.
   * Title should use UTF-8 encoding.
   */
-  bool BeginDocument(const char* title);
+  bool BeginDocument(const char *title);

  /**
   * Adds the recognized text from the source image to the current document.
@ -75,7 +74,7 @@ class TESS_API TessResultRenderer {
   * current TessBaseAPI implementation where the api has lots of state
   * information that we might want to add in.
   */
-  bool AddImage(TessBaseAPI* api);
+  bool AddImage(TessBaseAPI *api);

  /**
   * Finishes the document and finalizes the output data
@ -83,10 +82,10 @@ class TESS_API TessResultRenderer {
   */
  bool EndDocument();

-  const char* file_extension() const {
+  const char *file_extension() const {
    return file_extension_;
  }
-  const char* title() const {
+  const char *title() const {
    return title_.c_str();
  }

@ -108,7 +107,7 @@ class TESS_API TessResultRenderer {
    return imagenum_;
  }

- protected:
+protected:
  /**
   * Called by concrete classes.
   *
@ -119,13 +118,13 @@ class TESS_API TessResultRenderer {
   * files. For example "pdf" will produce a .pdf file, and "hocr"
   * will produce .hocr files.
   */
-  TessResultRenderer(const char* outputbase, const char* extension);
+  TessResultRenderer(const char *outputbase, const char *extension);

  // Hook for specialized handling in BeginDocument()
  virtual bool BeginDocumentHandler();

  // This must be overridden to render the OCR'd results
-  virtual bool AddImageHandler(TessBaseAPI* api) = 0;
+  virtual bool AddImageHandler(TessBaseAPI *api) = 0;

  // Hook for specialized handling in EndDocument()
  virtual bool EndDocumentHandler();
@ -133,62 +132,62 @@ class TESS_API TessResultRenderer {
  // Renderers can call this to append '\0' terminated strings into
  // the output string returned by GetOutput.
  // This method will grow the output buffer if needed.
-  void AppendString(const char* s);
+  void AppendString(const char *s);

  // Renderers can call this to append binary byte sequences into
  // the output string returned by GetOutput. Note that s is not necessarily
  // '\0' terminated (and can contain '\0' within it).
  // This method will grow the output buffer if needed.
-  void AppendData(const char* s, int len);
+  void AppendData(const char *s, int len);

- private:
-  const char* file_extension_;  // standard extension for generated output
-  STRING title_;                // title of document being rendered
-  int imagenum_;                // index of last image added
+private:
+  const char *file_extension_; // standard extension for generated output
+  std::string title_;          // title of document being rendered
+  int imagenum_;               // index of last image added

-  FILE* fout_;                // output file pointer
-  TessResultRenderer* next_;  // Can link multiple renderers together
-  bool happy_;                // I get grumpy when the disk fills up, etc.
+  FILE *fout_;               // output file pointer
+  TessResultRenderer *next_; // Can link multiple renderers together
+  bool happy_;               // I get grumpy when the disk fills up, etc.
 };

 /**
 * Renders tesseract output into a plain UTF-8 text string
 */
 class TESS_API TessTextRenderer : public TessResultRenderer {
- public:
-  explicit TessTextRenderer(const char* outputbase);
+public:
+  explicit TessTextRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

 /**
 * Renders tesseract output into an hocr text string
 */
 class TESS_API TessHOcrRenderer : public TessResultRenderer {
- public:
-  explicit TessHOcrRenderer(const char* outputbase, bool font_info);
-  explicit TessHOcrRenderer(const char* outputbase);
+public:
+  explicit TessHOcrRenderer(const char *outputbase, bool font_info);
+  explicit TessHOcrRenderer(const char *outputbase);

- protected:
+protected:
  bool BeginDocumentHandler() override;
-  bool AddImageHandler(TessBaseAPI* api) override;
+  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

- private:
-  bool font_info_;  // whether to print font information
+private:
+  bool font_info_; // whether to print font information
 };

 /**
 * Renders tesseract output into an alto text string
 */
 class TESS_API TessAltoRenderer : public TessResultRenderer {
- public:
-  explicit TessAltoRenderer(const char* outputbase);
+public:
+  explicit TessAltoRenderer(const char *outputbase);

- protected:
+protected:
  bool BeginDocumentHandler() override;
-  bool AddImageHandler(TessBaseAPI* api) override;
+  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;
 };

@ -196,99 +195,97 @@ class TESS_API TessAltoRenderer : public TessResultRenderer {
 * Renders Tesseract output into a TSV string
 */
 class TESS_API TessTsvRenderer : public TessResultRenderer {
- public:
-  explicit TessTsvRenderer(const char* outputbase, bool font_info);
-  explicit TessTsvRenderer(const char* outputbase);
+public:
+  explicit TessTsvRenderer(const char *outputbase, bool font_info);
+  explicit TessTsvRenderer(const char *outputbase);

- protected:
+protected:
  bool BeginDocumentHandler() override;
-  bool AddImageHandler(TessBaseAPI* api) override;
+  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

- private:
-  bool font_info_;  // whether to print font information
+private:
+  bool font_info_; // whether to print font information
 };

 /**
 * Renders tesseract output into searchable PDF
 */
 class TESS_API TessPDFRenderer : public TessResultRenderer {
- public:
+public:
  // datadir is the location of the TESSDATA. We need it because
  // we load a custom PDF font from this location.
-  TessPDFRenderer(const char* outputbase, const char* datadir,
-                  bool textonly = false);
+  TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly = false);

- protected:
+protected:
  bool BeginDocumentHandler() override;
-  bool AddImageHandler(TessBaseAPI* api) override;
+  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

- private:
+private:
  // We don't want to have every image in memory at once,
  // so we store some metadata as we go along producing
  // PDFs one page at a time. At the end, that metadata is
  // used to make everything that isn't easily handled in a
  // streaming fashion.
-  long int obj_;                     // counter for PDF objects
-  GenericVector<long int> offsets_;  // offset of every PDF object in bytes
-  GenericVector<long int> pages_;    // object number for every /Page object
-  std::string datadir_;              // where to find the custom font
-  bool textonly_;                    // skip images if set
+  long int obj_;                  // counter for PDF objects
+  std::vector<long int> offsets_; // offset of every PDF object in bytes
+  std::vector<long int> pages_;   // object number for every /Page object
+  std::string datadir_;           // where to find the custom font
+  bool textonly_;                 // skip images if set
  // Bookkeeping only. DIY = Do It Yourself.
  void AppendPDFObjectDIY(size_t objectsize);
  // Bookkeeping + emit data.
-  void AppendPDFObject(const char* data);
+  void AppendPDFObject(const char *data);
  // Create the /Contents object for an entire page.
-  char* GetPDFTextObjects(TessBaseAPI* api, double width, double height);
+  char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
  // Turn an image into a PDF object. Only transcode if we have to.
-  static bool imageToPDFObj(Pix* pix, const char* filename, long int objnum,
-                            char** pdf_object, long int* pdf_object_size,
-                            int jpg_quality);
+  static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum, char **pdf_object,
+                            long int *pdf_object_size, int jpg_quality);
 };

 /**
 * Renders tesseract output into a text string in UNLV format
 */
 class TESS_API TessUnlvRenderer : public TessResultRenderer {
- public:
-  explicit TessUnlvRenderer(const char* outputbase);
+public:
+  explicit TessUnlvRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

 /**
 * Renders tesseract output into a plain UTF-8 text string for LSTMBox
 */
 class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
- public:
-  explicit TessLSTMBoxRenderer(const char* outputbase);
+public:
+  explicit TessLSTMBoxRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

 /**
 * Renders tesseract output into a plain UTF-8 text string in box file format
 */
 class TESS_API TessBoxTextRenderer : public TessResultRenderer {
- public:
-  explicit TessBoxTextRenderer(const char* outputbase);
+public:
+  explicit TessBoxTextRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

 /**
 * Renders tesseract output into a plain UTF-8 text string in WordStr format
 */
 class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
- public:
-  explicit TessWordStrBoxRenderer(const char* outputbase);
+public:
+  explicit TessWordStrBoxRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

 #ifndef DISABLED_LEGACY_ENGINE
@ -297,15 +294,15 @@ class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
 * Renders tesseract output into an osd text string
 */
 class TESS_API TessOsdRenderer : public TessResultRenderer {
- public:
-  explicit TessOsdRenderer(const char* outputbase);
+public:
+  explicit TessOsdRenderer(const char *outputbase);

- protected:
-  bool AddImageHandler(TessBaseAPI* api) override;
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
 };

-#endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_API_RENDERER_H_
+#endif // TESSERACT_API_RENDERER_H_
--- a/include/tesseract/resultiterator.h
+++ b/include/tesseract/resultiterator.h
@ -4,7 +4,6 @@
 //              iterating in proper reading order over Bi Directional
 //              (e.g. mixed Hebrew and English) text.
 // Author:      David Eger
-// Created:     Fri May 27 13:58:06 PST 2011
 //
 // (C) Copyright 2011, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -22,28 +21,19 @@
 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_

-#include <set>     // for std::pair
-#include <vector>  // for std::vector
+#include "export.h"            // for TESS_API, TESS_LOCAL
+#include "ltrresultiterator.h" // for LTRResultIterator
+#include "publictypes.h"       // for PageIteratorLevel
+#include "unichar.h"           // for StrongScriptDirection

-#include "ltrresultiterator.h"  // for LTRResultIterator
-#include "platform.h"           // for TESS_API, TESS_LOCAL
-#include "publictypes.h"        // for PageIteratorLevel
-#include "unichar.h"            // for StrongScriptDirection
-
-template <typename T>
-class GenericVector;
-template <typename T>
-class GenericVectorEqEq;
-
-class STRING;
+#include <set>    // for std::pair
+#include <vector> // for std::vector

 namespace tesseract {

-class Tesseract;
-
 class TESS_API ResultIterator : public LTRResultIterator {
- public:
-  static ResultIterator* StartOfParagraph(const LTRResultIterator& resit);
+public:
+  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);

  /**
   * ResultIterator is copy constructible!
@ -85,8 +75,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
   * point at the last word in a paragraph.  See PageIterator for full comment.
   */
-  bool IsAtFinalElement(PageIteratorLevel level,
-                        PageIteratorLevel element) const override;
+  bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override;

  // ============= Functions that refer to words only ============.
  // Returns the number of blanks before the current word.
@ -98,15 +87,15 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Returns the null terminated UTF-8 encoded text string for the current
   * object at the given level. Use delete [] to free after use.
   */
-  virtual char* GetUTF8Text(PageIteratorLevel level) const;
+  virtual char *GetUTF8Text(PageIteratorLevel level) const;

  /**
   * Returns the LSTM choices for every LSTM timestep for the current word.
   */
-  virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
-  GetRawLSTMTimesteps() const;
-  virtual std::vector<std::vector<std::pair<const char*, float>>>*
-  GetBestLSTMSymbolChoices() const;
+  virtual std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
+      *GetRawLSTMTimesteps() const;
+  virtual std::vector<std::vector<std::pair<const char *, float>>> *GetBestLSTMSymbolChoices()
+      const;

  /**
   * Return whether the current paragraph's dominant reading direction
@ -138,25 +127,24 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
   *     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
   */
-  static void CalculateTextlineOrder(
-      bool paragraph_is_ltr,
-      const GenericVector<StrongScriptDirection>& word_dirs,
-      GenericVectorEqEq<int>* reading_order);
+  static void CalculateTextlineOrder(bool paragraph_is_ltr,
+                                     const std::vector<StrongScriptDirection> &word_dirs,
+                                     std::vector<int> *reading_order);

  static const int kMinorRunStart;
  static const int kMinorRunEnd;
  static const int kComplexWord;

- protected:
+protected:
  /**
   * We presume the data associated with the given iterator will outlive us.
   * NB: This is protected (not public) because it does something that is non-obvious:
   *   it resets to the beginning of the paragraph instead of staying wherever
   *   resit might have pointed.
   */
-  TESS_LOCAL explicit ResultIterator(const LTRResultIterator& resit);
+  explicit ResultIterator(const LTRResultIterator &resit);

- private:
+private:
  /**
   * Calculates the current paragraph's dominant writing direction.
   * Typically, members should use current_paragraph_ltr_ instead.
@ -174,14 +162,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
   *   kComplexWord    The previous word contains both left-to-right and
   *                   right-to-left characters and was treated as neutral.
   */
-  void CalculateTextlineOrder(bool paragraph_is_ltr,
-                              const LTRResultIterator& resit,
-                              GenericVectorEqEq<int>* indices) const;
+  void CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
+                              std::vector<int> *indices) const;
  /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
-  void CalculateTextlineOrder(bool paragraph_is_ltr,
-                              const LTRResultIterator& resit,
-                              GenericVector<StrongScriptDirection>* ssd,
-                              GenericVectorEqEq<int>* indices) const;
+  void CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
+                              std::vector<StrongScriptDirection> *ssd,
+                              std::vector<int> *indices) const;

  /**
   * What is the index of the current word in a strict left-to-right reading
@ -193,7 +179,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Given an iterator pointing at a word, returns the logical reading order
   * of blob indices for the word.
   */
-  void CalculateBlobOrder(GenericVector<int>* blob_indices) const;
+  void CalculateBlobOrder(std::vector<int> *blob_indices) const;

  /** Precondition: current_paragraph_is_ltr_ is set. */
  void MoveToLogicalStartOfTextline();
@ -214,10 +200,10 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Append any extra marks that should be appended to this word when printed.
   * Mostly, these are Unicode BiDi control characters.
   */
-  void AppendSuffixMarks(STRING* text) const;
+  void AppendSuffixMarks(std::string *text) const;

  /** Appends the current word in reading order to the given buffer.*/
-  void AppendUTF8WordText(STRING* text) const;
+  void AppendUTF8WordText(std::string *text) const;

  /**
   * Appends the text of the current text line, *assuming this iterator is
@ -226,7 +212,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Each textline is terminated in a single newline character.
   * If the textline ends a paragraph, it gets a second terminal newline.
   */
-  void IterateAndAppendUTF8TextlineText(STRING* text);
+  void IterateAndAppendUTF8TextlineText(std::string *text);

  /**
   * Appends the text of the current paragraph in reading order
@ -234,7 +220,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
   * Each textline is terminated in a single newline character, and the
   * paragraph gets an extra newline at the end.
   */
-  void AppendUTF8ParagraphText(STRING* text) const;
+  void AppendUTF8ParagraphText(std::string *text) const;

  /** Returns whether the bidi_debug flag is set to at least min_level. */
  bool BidiDebug(int min_level) const;
@ -257,6 +243,6 @@ class TESS_API ResultIterator : public LTRResultIterator {
  bool preserve_interword_spaces_;
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
+#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
--- a/include/tesseract/serialis.h
+++ b/include/tesseract/serialis.h
@ -1,163 +0,0 @@
-/**********************************************************************
- * File:        serialis.h  (Formerly serialmac.h)
- * Description: Inline routines and macros for serialisation functions
- * Author:      Phil Cheatle
- *
- * (C) Copyright 1990, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef SERIALIS_H
-#define SERIALIS_H
-
-#include <cstdint>  // uint8_t
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-
-template <typename T>
-class GenericVector;
-class STRING;
-
-/***********************************************************************
-  QUOTE_IT   MACRO DEFINITION
-  ===========================
-Replace <parm> with "<parm>".  <parm> may be an arbitrary number of tokens
-***********************************************************************/
-
-#define QUOTE_IT(parm) #parm
-
-namespace tesseract {
-
-// Return number of elements of an array.
-template <typename T, size_t N>
-constexpr size_t countof(T const (&)[N]) noexcept {
-  return N;
-}
-
-// Function to read a GenericVector<char> from a whole file.
-// Returns false on failure.
-using FileReader = bool (*)(const char* filename, GenericVector<char>* data);
-// Function to write a GenericVector<char> to a whole file.
-// Returns false on failure.
-using FileWriter = bool (*)(const GenericVector<char>& data,
-                            const char* filename);
-
-// Deserialize data from file.
-bool DeSerialize(FILE* fp, char* data, size_t n = 1);
-bool DeSerialize(FILE* fp, float* data, size_t n = 1);
-bool DeSerialize(FILE* fp, int8_t* data, size_t n = 1);
-bool DeSerialize(FILE* fp, int16_t* data, size_t n = 1);
-bool DeSerialize(FILE* fp, int32_t* data, size_t n = 1);
-bool DeSerialize(FILE* fp, uint8_t* data, size_t n = 1);
-bool DeSerialize(FILE* fp, uint16_t* data, size_t n = 1);
-bool DeSerialize(FILE* fp, uint32_t* data, size_t n = 1);
-
-// Serialize data to file.
-bool Serialize(FILE* fp, const char* data, size_t n = 1);
-bool Serialize(FILE* fp, const float* data, size_t n = 1);
-bool Serialize(FILE* fp, const int8_t* data, size_t n = 1);
-bool Serialize(FILE* fp, const int16_t* data, size_t n = 1);
-bool Serialize(FILE* fp, const int32_t* data, size_t n = 1);
-bool Serialize(FILE* fp, const uint8_t* data, size_t n = 1);
-bool Serialize(FILE* fp, const uint16_t* data, size_t n = 1);
-bool Serialize(FILE* fp, const uint32_t* data, size_t n = 1);
-
-// Simple file class.
-// Allows for portable file input from memory and from foreign file systems.
-class TFile {
- public:
-  TFile();
-  ~TFile();
-
-  // All the Open methods load the whole file into memory for reading.
-  // Opens a file with a supplied reader, or nullptr to use the default.
-  // Note that mixed read/write is not supported.
-  bool Open(const STRING& filename, FileReader reader);
-  // From an existing memory buffer.
-  bool Open(const char* data, int size);
-  // From an open file and an end offset.
-  bool Open(FILE* fp, int64_t end_offset);
-  // Sets the value of the swap flag, so that FReadEndian does the right thing.
-  void set_swap(bool value) {
-    swap_ = value;
-  }
-
-  // Deserialize data.
-  bool DeSerialize(char* data, size_t count = 1);
-  bool DeSerialize(double* data, size_t count = 1);
-  bool DeSerialize(float* data, size_t count = 1);
-  bool DeSerialize(int8_t* data, size_t count = 1);
-  bool DeSerialize(int16_t* data, size_t count = 1);
-  bool DeSerialize(int32_t* data, size_t count = 1);
-  bool DeSerialize(int64_t* data, size_t count = 1);
-  bool DeSerialize(uint8_t* data, size_t count = 1);
-  bool DeSerialize(uint16_t* data, size_t count = 1);
-  bool DeSerialize(uint32_t* data, size_t count = 1);
-  bool DeSerialize(uint64_t* data, size_t count = 1);
-
-  // Serialize data.
-  bool Serialize(const char* data, size_t count = 1);
-  bool Serialize(const double* data, size_t count = 1);
-  bool Serialize(const float* data, size_t count = 1);
-  bool Serialize(const int8_t* data, size_t count = 1);
-  bool Serialize(const int16_t* data, size_t count = 1);
-  bool Serialize(const int32_t* data, size_t count = 1);
-  bool Serialize(const int64_t* data, size_t count = 1);
-  bool Serialize(const uint8_t* data, size_t count = 1);
-  bool Serialize(const uint16_t* data, size_t count = 1);
-  bool Serialize(const uint32_t* data, size_t count = 1);
-  bool Serialize(const uint64_t* data, size_t count = 1);
-
-  // Skip data.
-  bool Skip(size_t count);
-
-  // Reads a line like fgets. Returns nullptr on EOF, otherwise buffer.
-  // Reads at most buffer_size bytes, including '\0' terminator, even if
-  // the line is longer. Does nothing if buffer_size <= 0.
-  char* FGets(char* buffer, int buffer_size);
-  // Replicates fread, followed by a swap of the bytes if needed, returning the
-  // number of items read. If swap_ is true then the count items will each have
-  // size bytes reversed.
-  int FReadEndian(void* buffer, size_t size, int count);
-  // Replicates fread, returning the number of items read.
-  int FRead(void* buffer, size_t size, int count);
-  // Resets the TFile as if it has been Opened, but nothing read.
-  // Only allowed while reading!
-  void Rewind();
-
-  // Open for writing. Either supply a non-nullptr data with OpenWrite before
-  // calling FWrite, (no close required), or supply a nullptr data to OpenWrite
-  // and call CloseWrite to write to a file after the FWrites.
-  void OpenWrite(GenericVector<char>* data);
-  bool CloseWrite(const STRING& filename, FileWriter writer);
-
-  // Replicates fwrite, returning the number of items written.
-  // To use fprintf, use snprintf and FWrite.
-  int FWrite(const void* buffer, size_t size, int count);
-
- private:
-  // The buffered data from the file.
-  GenericVector<char>* data_;
-  // The number of bytes used so far.
-  int offset_;
-  // True if the data_ pointer is owned by *this.
-  bool data_is_owned_;
-  // True if the TFile is open for writing.
-  bool is_writing_;
-  // True if bytes need to be swapped in FReadEndian.
-  bool swap_;
-};
-
-}  // namespace tesseract.
-
-#endif
--- a/include/tesseract/strngs.h
+++ b/include/tesseract/strngs.h
@ -1,188 +0,0 @@
-/**********************************************************************
- * File:        strngs.h  (Formerly strings.h)
- * Description: STRING class definition.
- * Author:      Ray Smith
- *
- * (C) Copyright 1991, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef STRNGS_H
-#define STRNGS_H
-
-#include <cassert>  // for assert
-#include <cstdint>  // for uint32_t
-#include <cstdio>   // for FILE
-#include <cstring>  // for strncpy
-
-#include "platform.h"  // for TESS_API
-
-namespace tesseract {
-class TFile;
-}  // namespace tesseract.
-
-// STRING_IS_PROTECTED means that  string[index] = X is invalid
-// because you have to go through strings interface to modify it.
-// This allows the string to ensure internal integrity and maintain
-// its own string length. Unfortunately this is not possible because
-// STRINGS are used as direct-manipulation data buffers for things
-// like length arrays and many places cast away the const on c_str()
-// to mutate the string. Turning this off means that internally we
-// cannot assume we know the strlen.
-#define STRING_IS_PROTECTED 0
-
-template <typename T>
-class GenericVector;
-
-class TESS_API STRING {
- public:
-  STRING();
-  STRING(const STRING& string);
-  STRING(const char* string);
-  STRING(const char* data, int length);
-  ~STRING();
-
-  // Writes to the given file. Returns false in case of error.
-  bool Serialize(FILE* fp) const;
-  // Reads from the given file. Returns false in case of error.
-  // If swap is true, assumes a big/little-endian swap is needed.
-  bool DeSerialize(bool swap, FILE* fp);
-  // Writes to the given file. Returns false in case of error.
-  bool Serialize(tesseract::TFile* fp) const;
-  // Reads from the given file. Returns false in case of error.
-  // If swap is true, assumes a big/little-endian swap is needed.
-  bool DeSerialize(tesseract::TFile* fp);
-  // As DeSerialize, but only seeks past the data - hence a static method.
-  static bool SkipDeSerialize(tesseract::TFile* fp);
-
-  bool contains(char c) const;
-  int32_t length() const;
-  int32_t size() const {
-    return length();
-  }
-  // Workaround to avoid g++ -Wsign-compare warnings.
-  uint32_t unsigned_size() const {
-    const int32_t len = length();
-    assert(0 <= len);
-    return static_cast<uint32_t>(len);
-  }
-  const char* c_str() const;
-
-  inline char* strdup() const {
-    int32_t len = length() + 1;
-    return strncpy(new char[len], GetCStr(), len);
-  }
-
-#if STRING_IS_PROTECTED
-  const char& operator[](int32_t index) const;
-  // len is number of chars in s to insert starting at index in this string
-  void insert_range(int32_t index, const char* s, int len);
-  void erase_range(int32_t index, int len);
-#else
-  char& operator[](int32_t index) const;
-#endif
-  void split(char c, GenericVector<STRING>* splited);
-  void truncate_at(int32_t index);
-
-  bool operator==(const STRING& string) const;
-  bool operator!=(const STRING& string) const;
-  bool operator!=(const char* string) const;
-
-  STRING& operator=(const char* string);
-  STRING& operator=(const STRING& string);
-
-  STRING operator+(const STRING& string) const;
-  STRING operator+(char ch) const;
-
-  STRING& operator+=(const char* string);
-  STRING& operator+=(const STRING& string);
-  STRING& operator+=(char ch);
-
-  // Assignment for strings which are not null-terminated.
-  void assign(const char* cstr, int len);
-
-  // Appends the given string and int (as a %d) to this.
-  // += cannot be used for ints as there as a char += operator that would
-  // be ambiguous, and ints usually need a string before or between them
-  // anyway.
-  void add_str_int(const char* str, int number);
-  // Appends the given string and double (as a %.8g) to this.
-  void add_str_double(const char* str, double number);
-
-  // ensure capacity but keep pointer encapsulated
-  inline void ensure(int32_t min_capacity) {
-    ensure_cstr(min_capacity);
-  }
-
- private:
-  typedef struct STRING_HEADER {
-    // How much space was allocated in the string buffer for char data.
-    int capacity_;
-
-    // used_ is how much of the capacity is currently being used,
-    // including a '\0' terminator.
-    //
-    // If used_ is 0 then string is nullptr (not even the '\0')
-    // else if used_ > 0 then it is strlen() + 1 (because it includes '\0')
-    // else strlen is >= 0 (not nullptr) but needs to be computed.
-    //      this condition is set when encapsulation is violated because
-    //      an API returned a mutable string.
-    //
-    // capacity_ - used_ = excess capacity that the string can grow
-    //                     without reallocating
-    mutable int used_;
-  } STRING_HEADER;
-
-  // To preserve the behavior of the old serialization, we only have space
-  // for one pointer in this structure. So we are embedding a data structure
-  // at the start of the storage that will hold additional state variables,
-  // then storing the actual string contents immediately after.
-  STRING_HEADER* data_;
-
-  // returns the header part of the storage
-  inline STRING_HEADER* GetHeader() {
-    return data_;
-  }
-  inline const STRING_HEADER* GetHeader() const {
-    return data_;
-  }
-
-  // returns the string data part of storage
-  inline char* GetCStr() {
-    return (reinterpret_cast<char*>(data_)) + sizeof(STRING_HEADER);
-  }
-
-  inline const char* GetCStr() const {
-    return (reinterpret_cast<const char*>(data_)) + sizeof(STRING_HEADER);
-  }
-  inline bool InvariantOk() const {
-#if STRING_IS_PROTECTED
-    return (GetHeader()->used_ == 0)
-               ? (c_str() == nullptr)
-               : (GetHeader()->used_ == (strlen(c_str()) + 1));
-#else
-    return true;
-#endif
-  }
-
-  // Ensure string has requested capacity as optimization
-  // to avoid unnecessary reallocations.
-  // The return value is a cstr buffer with at least requested capacity
-  char* ensure_cstr(int32_t min_capacity);
-
-  void FixHeader() const;  // make used_ non-negative, even if const
-
-  char* AllocData(int used, int capacity);
-  void DiscardData();
-};
-
-#endif
--- a/include/tesseract/thresholder.h
+++ b/include/tesseract/thresholder.h
@ -19,7 +19,7 @@
 #ifndef TESSERACT_CCMAIN_THRESHOLDER_H_
 #define TESSERACT_CCMAIN_THRESHOLDER_H_

-#include "platform.h"
+#include "export.h"
 #include "publictypes.h"

 struct Pix;
@ -33,7 +33,7 @@ namespace tesseract {
 /// be useful for multiple calls to SetRectangle and ThresholdTo* if
 /// desired.
 class TESS_API ImageThresholder {
- public:
+public:
  ImageThresholder();
  virtual ~ImageThresholder();

@ -51,8 +51,8 @@ class TESS_API ImageThresholder {
  /// Binary images of 1 bit per pixel may also be given but they must be
  /// byte packed with the MSB of the first byte being the first pixel, and a
  /// one pixel is WHITE. For binary images set bytes_per_pixel=0.
-  void SetImage(const unsigned char* imagedata, int width, int height,
-                int bytes_per_pixel, int bytes_per_line);
+  void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,
+                int bytes_per_line);

  /// Store the coordinates of the rectangle to process for later use.
  /// Doesn't actually do any thresholding.
@ -62,8 +62,8 @@ class TESS_API ImageThresholder {
  /// original image (not just within the rectangle).
  /// Left and top are enough with top-down coordinates, but
  /// the height of the rectangle and the image are needed for bottom-up.
-  virtual void GetImageSizes(int* left, int* top, int* width, int* height,
-                             int* imagewidth, int* imageheight);
+  virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
+                             int *imageheight);

  /// Return true if the source image is color.
  bool IsColor() const {
@ -111,13 +111,13 @@ class TESS_API ImageThresholder {
  /// SetImage for Pix clones its input, so the source pix may be pixDestroyed
  /// immediately after, but may not go away until after the Thresholder has
  /// finished with it.
-  void SetImage(const Pix* pix);
+  void SetImage(const Pix *pix);

  /// Threshold the source image as efficiently as possible to the output Pix.
  /// Creates a Pix and sets pix to point to the resulting pointer.
  /// Caller must use pixDestroy to free the created Pix.
  /// Returns false on error.
-  virtual bool ThresholdToPix(PageSegMode pageseg_mode, Pix** pix);
+  virtual bool ThresholdToPix(PageSegMode pageseg_mode, Pix **pix);

  // Gets a pix that contains an 8 bit threshold value at each pixel. The
  // returned pix may be an integer reduction of the binary image such that
@ -126,22 +126,22 @@ class TESS_API ImageThresholder {
  // Ideally the 8 bit threshold should be the exact threshold used to generate
  // the binary image in ThresholdToPix, but this is not a hard constraint.
  // Returns nullptr if the input is binary. PixDestroy after use.
-  virtual Pix* GetPixRectThresholds();
+  virtual Pix *GetPixRectThresholds();

  /// Get a clone/copy of the source image rectangle.
  /// The returned Pix must be pixDestroyed.
  /// This function will be used in the future by the page layout analysis, and
  /// the layout analysis that uses it will only be available with Leptonica,
  /// so there is no raw equivalent.
-  Pix* GetPixRect();
+  Pix *GetPixRect();

  // Get a clone/copy of the source image rectangle, reduced to greyscale,
  // and at the same resolution as the output binary.
  // The returned Pix must be pixDestroyed.
  // Provided to the classifier to extract features from the greyscale image.
-  virtual Pix* GetPixRectGrey();
+  virtual Pix *GetPixRectGrey();

- protected:
+protected:
  // ----------------------------------------------------------------------
  // Utility functions that may be useful components for other thresholders.

@ -155,34 +155,34 @@ class TESS_API ImageThresholder {
  }

  // Otsu thresholds the rectangle, taking the rectangle from *this.
-  void OtsuThresholdRectToPix(Pix* src_pix, Pix** out_pix) const;
+  void OtsuThresholdRectToPix(Pix *src_pix, Pix **out_pix) const;

  /// Threshold the rectangle, taking everything except the src_pix
  /// from the class, using thresholds/hi_values to the output pix.
  /// NOTE that num_channels is the size of the thresholds and hi_values
  /// arrays and also the bytes per pixel in src_pix.
-  void ThresholdRectToPix(Pix* src_pix, int num_channels, const int* thresholds,
-                          const int* hi_values, Pix** pix) const;
+  void ThresholdRectToPix(Pix *src_pix, int num_channels, const int *thresholds,
+                          const int *hi_values, Pix **pix) const;

- protected:
+protected:
  /// Clone or other copy of the source Pix.
  /// The pix will always be PixDestroy()ed on destruction of the class.
-  Pix* pix_;
+  Pix *pix_;

-  int image_width_;   ///< Width of source pix_.
-  int image_height_;  ///< Height of source pix_.
-  int pix_channels_;  ///< Number of 8-bit channels in pix_.
-  int pix_wpl_;       ///< Words per line of pix_.
+  int image_width_;  ///< Width of source pix_.
+  int image_height_; ///< Height of source pix_.
+  int pix_channels_; ///< Number of 8-bit channels in pix_.
+  int pix_wpl_;      ///< Words per line of pix_.
  // Limits of image rectangle to be processed.
-  int scale_;          ///< Scale factor from original image.
-  int yres_;           ///< y pixels/inch in source image.
-  int estimated_res_;  ///< Resolution estimate from text size.
+  int scale_;         ///< Scale factor from original image.
+  int yres_;          ///< y pixels/inch in source image.
+  int estimated_res_; ///< Resolution estimate from text size.
  int rect_left_;
  int rect_top_;
  int rect_width_;
  int rect_height_;
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCMAIN_THRESHOLDER_H_
+#endif // TESSERACT_CCMAIN_THRESHOLDER_H_
--- a/include/tesseract/unichar.h
+++ b/include/tesseract/unichar.h
@ -19,13 +19,14 @@
 #ifndef TESSERACT_CCUTIL_UNICHAR_H_
 #define TESSERACT_CCUTIL_UNICHAR_H_

-#include <memory.h>
+#include "export.h"

+#include <memory.h>
 #include <cstring>
 #include <string>
 #include <vector>

-#include "platform.h"
+namespace tesseract {

 // Maximum number of characters that can be stored in a UNICHAR. Must be
 // at least 4. Must not exceed 31 without changing the coding of length.
@ -41,23 +42,21 @@ static const int INVALID_UNICHAR_ID = -1;
 static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";

 enum StrongScriptDirection {
-  DIR_NEUTRAL = 0,        // Text contains only neutral characters.
-  DIR_LEFT_TO_RIGHT = 1,  // Text contains no Right-to-Left characters.
-  DIR_RIGHT_TO_LEFT = 2,  // Text contains no Left-to-Right characters.
-  DIR_MIX = 3,            // Text contains a mixture of left-to-right
-                          // and right-to-left characters.
+  DIR_NEUTRAL = 0,       // Text contains only neutral characters.
+  DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
+  DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
+  DIR_MIX = 3,           // Text contains a mixture of left-to-right
+                         // and right-to-left characters.
 };

-namespace tesseract {
-
 using char32 = signed int;

 // The UNICHAR class holds a single classification result. This may be
 // a single Unicode character (stored as between 1 and 4 utf8 bytes) or
 // multiple Unicode characters representing the NFKC expansion of a ligature
 // such as fi, ffl etc. These are also stored as utf8.
-class UNICHAR {
- public:
+class TESS_API UNICHAR {
+public:
  UNICHAR() {
    memset(chars, 0, UNICHAR_LEN);
  }
@ -65,7 +64,7 @@ class UNICHAR {
  // Construct from a utf8 string. If len<0 then the string is null terminated.
  // If the string is too long to fit in the UNICHAR then it takes only what
  // will fit.
-  UNICHAR(const char* utf8_str, int len);
+  UNICHAR(const char *utf8_str, int len);

  // Construct from a single UCS4 character.
  explicit UNICHAR(int unicode);
@ -82,15 +81,15 @@ class UNICHAR {
  }

  // Get a UTF8 string, but NOT null-terminated.
-  const char* utf8() const {
+  const char *utf8() const {
    return chars;
  }

  // Get a terminated UTF8 string: Must delete[] it after use.
-  char* utf8_str() const;
+  char *utf8_str() const;

  // Get the number of bytes in the first character of the given utf8 string.
-  static int utf8_step(const char* utf8_str);
+  static int utf8_step(const char *utf8_str);

  // A class to simplify iterating over and accessing elements of a UTF8
  // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
@ -106,15 +105,15 @@ class UNICHAR {
  //     int char_len = it.get_utf8(buf); buf[char_len] = '\0';
  //     tprintf("Char = %s\n", buf);
  //   }
-  class const_iterator {
+  class TESS_API const_iterator {
    using CI = const_iterator;

-   public:
+  public:
    // Step to the next UTF8 character.
    // If the current position is at an illegal UTF8 character, then print an
    // error message and step by one byte. If the current position is at a
    // null byte ('\0'), don't step past it.
-    const_iterator& operator++();
+    const_iterator &operator++();

    // Return the UCS-4 value at the current position.
    // If the current position is at an illegal UTF8 value, return a single
@ -126,7 +125,7 @@ class UNICHAR {
    // If the current position is at an illegal UTF8 value, writes a single
    // space character and returns 1.
    // Note that this method does not null-terminate the buffer.
-    int get_utf8(char* buf) const;
+    int get_utf8(char *buf) const;
    // Returns the number of bytes of the current codepoint. Returns 1 if the
    // current position is at an illegal UTF8 value.
    int utf8_len() const;
@ -134,45 +133,45 @@ class UNICHAR {
    bool is_legal() const;

    // Return the pointer into the string at the current position.
-    const char* utf8_data() const {
+    const char *utf8_data() const {
      return it_;
    }

    // Iterator equality operators.
-    friend bool operator==(const CI& lhs, const CI& rhs) {
+    friend bool operator==(const CI &lhs, const CI &rhs) {
      return lhs.it_ == rhs.it_;
    }
-    friend bool operator!=(const CI& lhs, const CI& rhs) {
+    friend bool operator!=(const CI &lhs, const CI &rhs) {
      return !(lhs == rhs);
    }

-   private:
+  private:
    friend class UNICHAR;
-    explicit const_iterator(const char* it) : it_(it) {}
+    explicit const_iterator(const char *it) : it_(it) {}

-    const char* it_;  // Pointer into the string.
+    const char *it_; // Pointer into the string.
  };

  // Create a start/end iterator pointing to a string. Note that these methods
  // are static and do NOT create a copy or take ownership of the underlying
  // array.
-  static const_iterator begin(const char* utf8_str, int byte_length);
-  static const_iterator end(const char* utf8_str, int byte_length);
+  static const_iterator begin(const char *utf8_str, int byte_length);
+  static const_iterator end(const char *utf8_str, int byte_length);

  // Converts a utf-8 string to a vector of unicodes.
  // Returns an empty vector if the input contains invalid UTF-8.
-  static std::vector<char32> UTF8ToUTF32(const char* utf8_str);
+  static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
  // Converts a vector of unicodes to a utf8 string.
  // Returns an empty string if the input contains an invalid unicode.
-  static std::string UTF32ToUTF8(const std::vector<char32>& str32);
+  static std::string UTF32ToUTF8(const std::vector<char32> &str32);

- private:
+private:
  // A UTF-8 representation of 1 or more Unicode characters.
  // The last element (chars[UNICHAR_LEN - 1]) is a length if
  // its value < UNICHAR_LEN, otherwise it is a genuine character.
  char chars[UNICHAR_LEN]{};
 };

-}  // namespace tesseract
+} // namespace tesseract

-#endif  // TESSERACT_CCUTIL_UNICHAR_H_
+#endif // TESSERACT_CCUTIL_UNICHAR_H_
--- a/include/tesseract/version.h.in
+++ b/include/tesseract/version.h.in
@ -18,13 +18,11 @@
 #ifndef TESSERACT_API_VERSION_H_
 #define TESSERACT_API_VERSION_H_

-#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
-#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
-#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
+#define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@
+#define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@
+#define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@
 #define TESSERACT_VERSION \
-  (TESSERACT_MAJOR_VERSION << 16 | \
-   TESSERACT_MINOR_VERSION << 8 | \
-   TESSERACT_MICRO_VERSION)
+  (TESSERACT_MAJOR_VERSION << 16 | TESSERACT_MINOR_VERSION << 8 | TESSERACT_MICRO_VERSION)
 #define TESSERACT_VERSION_STR "@PACKAGE_VERSION@"

-#endif  // TESSERACT_API_VERSION_H_
+#endif // TESSERACT_API_VERSION_H_
--- a/java/Makefile.am
+++ b/java/Makefile.am
@ -36,11 +36,11 @@ SCROLLVIEW_CLASSES = \
 	com/google/scrollview/ScrollView.class

 SCROLLVIEW_LIBS = \
-	piccolo2d-core-3.0.jar \
-	piccolo2d-extras-3.0.jar \
+	piccolo2d-core-3.0.1.jar \
+	piccolo2d-extras-3.0.1.jar \
 	jaxb-api-2.3.1.jar

-CLASSPATH = piccolo2d-core-3.0.jar:piccolo2d-extras-3.0.jar:jaxb-api-2.3.1.jar
+CLASSPATH = piccolo2d-core-3.0.1.jar:piccolo2d-extras-3.0.1.jar:jaxb-api-2.3.1.jar

 ScrollView.jar : $(SCROLLVIEW_CLASSES)
 	$(JAR) cfm $@ $(srcdir)/Manifest.txt com/google/scrollview/*.class \
@ -51,9 +51,9 @@ $(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(SCROLLVIEW_LIBS)

 .PHONY: fetch-jars
 fetch-jars $(SCROLLVIEW_LIBS):
-	curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar
-	curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar
-	curl -L http://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar > jaxb-api-2.3.1.jar
+	curl -s -S -L -o piccolo2d-core-3.0.1.jar "https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar"
+	curl -s -S -L -o piccolo2d-extras-3.0.1.jar "https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0.1/piccolo2d-extras-3.0.1.jar"
+	curl -s -S -L -o jaxb-api-2.3.1.jar "https://search.maven.org/remotecontent?filepath=javax/xml/bind/jaxb-api/2.3.1/jaxb-api-2.3.1.jar"

 .PHONY: install-jars
 install-jars : ScrollView.jar
--- a/java/Manifest.txt
+++ b/java/Manifest.txt
@ -1,2 +1,2 @@
 Main-Class: com/google/scrollview/ScrollView
-Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar jaxb-api-2.3.1.jar
+Class-Path: ScrollView.jar piccolo2d-core-3.0.1.jar piccolo2d-extras-3.0.1.jar jaxb-api-2.3.1.jar
--- a/src/api/altorenderer.cpp
+++ b/src/api/altorenderer.cpp
@ -13,22 +13,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <memory>
-#include <sstream>  // for std::stringstream
-#include <tesseract/baseapi.h>
 #ifdef _WIN32
-# include "host.h"    // windows.h for MultiByteToWideChar, ...
+#  include "host.h" // windows.h for MultiByteToWideChar, ...
 #endif
+
+#include <tesseract/baseapi.h>
 #include <tesseract/renderer.h>
-#include <tesseract/strngs.h> // for STRING
+
+#include <memory>
+#include <sstream> // for std::stringstream

 namespace tesseract {

 /// Add coordinates to specified TextBlock, TextLine or String bounding box.
 /// Add word confidence if adding to a String bounding box.
 ///
-static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
-                         std::stringstream& alto_str) {
+static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
+                         std::stringstream &alto_str) {
  int left, top, right, bottom;
  it->BoundingBox(level, &left, &top, &right, &bottom);

@ -90,9 +91,10 @@ bool TessAltoRenderer::BeginDocumentHandler() {
 ///
 /// Append the ALTO XML for the layout of the image
 ///
-bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
-  if (text == nullptr) return false;
+  if (text == nullptr)
+    return false;

  AppendString(text.get());

@ -108,14 +110,14 @@ bool TessAltoRenderer::EndDocumentHandler() {
  return true;
 }

-TessAltoRenderer::TessAltoRenderer(const char* outputbase)
+TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml") {}

 ///
 /// Make an XML-formatted string with ALTO markup from the internal
 /// data structures.
 ///
-char* TessBaseAPI::GetAltoText(int page_number) {
+char *TessBaseAPI::GetAltoText(int page_number) {
  return GetAltoText(nullptr, page_number);
 }

@ -123,27 +125,26 @@ char* TessBaseAPI::GetAltoText(int page_number) {
 /// Make an XML-formatted string with ALTO markup from the internal
 /// data structures.
 ///
-char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
+char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
    return nullptr;

  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;

-  if (input_file_ == nullptr) SetInputName(nullptr);
+  if (input_file_.empty()) {
+    SetInputName(nullptr);
+  }

 #ifdef _WIN32
  // convert input name from ANSI encoding to utf-8
-  int str16_len =
-      MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1, nullptr, 0);
-  wchar_t* uni16_str = new WCHAR[str16_len];
-  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1,
-                                  uni16_str, str16_len);
-  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
-                                     0, nullptr, nullptr);
-  char* utf8_str = new char[utf8_len];
-  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
-                      nullptr, nullptr);
-  *input_file_ = utf8_str;
+  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
+  wchar_t *uni16_str = new WCHAR[str16_len];
+  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
+  int utf8_len =
+      WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
+  char *utf8_str = new char[utf8_len];
+  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
+  input_file_ = utf8_str;
  delete[] uni16_str;
  delete[] utf8_str;
 #endif
@ -151,16 +152,14 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
  std::stringstream alto_str;
  // Use "C" locale (needed for int values larger than 999).
  alto_str.imbue(std::locale::classic());
-  alto_str
-      << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
-      << rect_height_
-      << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
-      << " ID=\"page_" << page_number << "\">\n"
-      << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
-      << " WIDTH=\"" << rect_width_ << "\""
-      << " HEIGHT=\"" << rect_height_ << "\">\n";
+  alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
+           << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
+           << " ID=\"page_" << page_number << "\">\n"
+           << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
+           << " WIDTH=\"" << rect_width_ << "\""
+           << " HEIGHT=\"" << rect_height_ << "\">\n";

-  ResultIterator* res_it = GetIterator();
+  ResultIterator *res_it = GetIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
@ -193,13 +192,11 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
    bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

-
    int left, top, right, bottom;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);

    do {
-      const std::unique_ptr<const char[]> grapheme(
-          res_it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        alto_str << HOcrEscape(grapheme.get()).c_str();
      }
@ -218,8 +215,8 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
      int vpos = top;
      res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
      int width = left - hpos;
-      alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
-               << "\" HPOS=\"" << hpos << "\"/>\n";
+      alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
+               << "\"/>\n";
    }

    if (last_word_in_tblock) {
@ -235,12 +232,12 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {

  alto_str << "\t\t\t</PrintSpace>\n"
           << "\t\t</Page>\n";
-  const std::string& text = alto_str.str();
+  const std::string &text = alto_str.str();

-  char* result = new char[text.length() + 1];
+  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  delete res_it;
  return result;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
--- a/src/api/capi.cpp
+++ b/src/api/capi.cpp
--- a/src/api/hocrrenderer.cpp
+++ b/src/api/hocrrenderer.cpp
@ -17,28 +17,27 @@
 *
 **********************************************************************/

-#include <locale>     // for std::locale::classic
-#include <memory>     // for std::unique_ptr
-#include <sstream>    // for std::stringstream
-#include <tesseract/baseapi.h>  // for TessBaseAPI
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <locale>              // for std::locale::classic
+#include <memory>              // for std::unique_ptr
+#include <sstream>             // for std::stringstream
 #ifdef _WIN32
-# include "host.h"    // windows.h for MultiByteToWideChar, ...
+#  include "host.h" // windows.h for MultiByteToWideChar, ...
 #endif
 #include <tesseract/renderer.h>
-#include "tesseractclass.h"  // for Tesseract
+#include "tesseractclass.h" // for Tesseract

 namespace tesseract {

 /**
 * Gets the block orientation at the current iterator position.
 */
-static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
+static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
  tesseract::Orientation orientation;
  tesseract::WritingDirection writing_direction;
  tesseract::TextlineOrder textline_order;
  float deskew_angle;
-  it->Orientation(&orientation, &writing_direction, &textline_order,
-                  &deskew_angle);
+  it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
  return orientation;
 }

@ -50,9 +49,8 @@ static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
 * method currently only inserts a 'textangle' property to indicate the rotation
 * direction and does not add any baseline information to the hocr string.
 */
-static void AddBaselineCoordsTohOCR(const PageIterator* it,
-                                    PageIteratorLevel level,
-                                    std::stringstream& hocr_str) {
+static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
+                                    std::stringstream &hocr_str) {
  tesseract::Orientation orientation = GetBlockTextOrientation(it);
  if (orientation != ORIENTATION_PAGE_UP) {
    hocr_str << "; textangle " << 360 - orientation * 90;
@ -64,7 +62,8 @@ static void AddBaselineCoordsTohOCR(const PageIterator* it,

  // Try to get the baseline coordinates at this level.
  int x1, y1, x2, y2;
-  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return;
+  if (!it->Baseline(level, &x1, &y1, &x2, &y2))
+    return;
  // Following the description of this field of the hOCR spec, we convert the
  // baseline coordinates so that "the bottom left of the bounding box is the
  // origin".
@ -82,27 +81,25 @@ static void AddBaselineCoordsTohOCR(const PageIterator* it,
  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
  double p0 = y1 - p1 * x1;

-  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
-           << round(p0 * 1000.0) / 1000.0;
+  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " << round(p0 * 1000.0) / 1000.0;
 }

-static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
-                         std::stringstream& hocr_str) {
+static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
+                         std::stringstream &hocr_str) {
  int left, top, right, bottom;
  it->BoundingBox(level, &left, &top, &right, &bottom);
  // This is the only place we use double quotes instead of single quotes,
  // but it may too late to change for consistency
-  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
-           << bottom;
+  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " << bottom;
  // Add baseline coordinates & heights for textlines only.
  if (level == RIL_TEXTLINE) {
    AddBaselineCoordsTohOCR(it, level, hocr_str);
    // add custom height measures
-    float row_height, descenders, ascenders;  // row attributes
+    float row_height, descenders, ascenders; // row attributes
    it->RowAttributes(&row_height, &descenders, &ascenders);
    // TODO(rays): Do we want to limit these to a single decimal place?
-    hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
-             << "; x_ascenders " << ascenders;
+    hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders << "; x_ascenders "
+             << ascenders;
  }
  hocr_str << "\">";
 }
@ -116,7 +113,7 @@ static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
 * STL removed from original patch submission and refactored by rays.
 * Returned string must be freed with the delete [] operator.
 */
-char* TessBaseAPI::GetHOCRText(int page_number) {
+char *TessBaseAPI::GetHOCRText(int page_number) {
  return GetHOCRText(nullptr, page_number);
 }

@ -129,34 +126,32 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
 * STL removed from original patch submission and refactored by rays.
 * Returned string must be freed with the delete [] operator.
 */
-char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
+char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
    return nullptr;

  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
-  int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
-  bool para_is_ltr = true;        // Default direction is LTR
-  const char* paragraph_lang = nullptr;
+  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
+  bool para_is_ltr = true;       // Default direction is LTR
+  const char *paragraph_lang = nullptr;
  bool font_info = false;
  bool hocr_boxes = false;
  GetBoolVariable("hocr_font_info", &font_info);
  GetBoolVariable("hocr_char_boxes", &hocr_boxes);

-  if (input_file_ == nullptr) SetInputName(nullptr);
+  if (input_file_.empty())
+    SetInputName(nullptr);

 #ifdef _WIN32
  // convert input name from ANSI encoding to utf-8
-  int str16_len =
-      MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1, nullptr, 0);
-  wchar_t* uni16_str = new WCHAR[str16_len];
-  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1,
-                                  uni16_str, str16_len);
-  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
-                                     0, nullptr, nullptr);
-  char* utf8_str = new char[utf8_len];
-  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
-                      nullptr, nullptr);
-  *input_file_ = utf8_str;
+  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
+  wchar_t *uni16_str = new WCHAR[str16_len];
+  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
+  int utf8_len =
+      WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
+  char *utf8_str = new char[utf8_len];
+  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
+  input_file_ = utf8_str;
  delete[] uni16_str;
  delete[] utf8_str;
 #endif
@ -170,14 +165,13 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
  hocr_str << " id='"
           << "page_" << page_id << "'";
  hocr_str << " title='image \"";
-  if (input_file_) {
-    hocr_str << HOcrEscape(input_file_->c_str()).c_str();
+  if (!input_file_.empty()) {
+    hocr_str << HOcrEscape(input_file_.c_str()).c_str();
  } else {
    hocr_str << "unknown";
  }
-  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
-           << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
-           << "'>\n";
+  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
+           << rect_height_ << "; ppageno " << page_number << "'>\n";

  std::unique_ptr<ResultIterator> res_it(GetIterator());
  while (!res_it->Empty(RIL_BLOCK)) {
@ -188,7 +182,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {

    // Open any new block/paragraph/textline.
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
-      para_is_ltr = true;  // reset to default direction
+      para_is_ltr = true; // reset to default direction
      hocr_str << "   <div class='ocr_carea'"
               << " id='"
               << "block_" << page_id << "_" << bcnt << "'";
@ -230,12 +224,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {

    // Now, process the word...
    int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
-    std::vector<std::vector<std::vector<std::pair<const char*, float>>>>* rawTimestepMap =
-        nullptr;
-    std::vector<std::vector<std::pair<const char*, float>>>* CTCMap =
-        nullptr;
+    std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap = nullptr;
+    std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
    if (lstm_choice_mode) {
-
      CTCMap = res_it->GetBestLSTMSymbolChoices();
      rawTimestepMap = res_it->GetRawLSTMTimesteps();
    }
@ -245,14 +236,12 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
    int left, top, right, bottom;
    bool bold, italic, underlined, monospace, serif, smallcaps;
    int pointsize, font_id;
-    const char* font_name;
+    const char *font_name;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
-    font_name =
-        res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
-                                   &serif, &smallcaps, &pointsize, &font_id);
-    hocr_str << " title='bbox " << left << " " << top << " " << right << " "
-             << bottom << "; x_wconf "
-             << static_cast<int>(res_it->Confidence(RIL_WORD));
+    font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
+                                           &smallcaps, &pointsize, &font_id);
+    hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
+             << "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
    if (font_info) {
      if (font_name) {
        hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
@ -260,86 +249,82 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
      hocr_str << "; x_fsize " << pointsize;
    }
    hocr_str << "'";
-    const char* lang = res_it->WordRecognitionLanguage();
+    const char *lang = res_it->WordRecognitionLanguage();
    if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
      hocr_str << " lang='" << lang << "'";
    }
    switch (res_it->WordDirection()) {
      // Only emit direction if different from current paragraph direction
      case DIR_LEFT_TO_RIGHT:
-        if (!para_is_ltr) hocr_str << " dir='ltr'";
+        if (!para_is_ltr)
+          hocr_str << " dir='ltr'";
        break;
      case DIR_RIGHT_TO_LEFT:
-        if (para_is_ltr) hocr_str << " dir='rtl'";
+        if (para_is_ltr)
+          hocr_str << " dir='rtl'";
        break;
      case DIR_MIX:
      case DIR_NEUTRAL:
-      default:  // Do nothing.
+      default: // Do nothing.
        break;
    }
    hocr_str << ">";
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
-    if (bold) hocr_str << "<strong>";
-    if (italic) hocr_str << "<em>";
+    if (bold)
+      hocr_str << "<strong>";
+    if (italic)
+      hocr_str << "<em>";
    do {
-      const std::unique_ptr<const char[]> grapheme(
-          res_it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        if (hocr_boxes) {
          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
-          hocr_str << "\n       <span class='ocrx_cinfo' title='x_bboxes "
-                   << left << " " << top << " " << right << " " << bottom
-                   << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
+          hocr_str << "\n       <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top
+                   << " " << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
+                   << "'>";
        }
        hocr_str << HOcrEscape(grapheme.get()).c_str();
        if (hocr_boxes) {
          hocr_str << "</span>";
          tesseract::ChoiceIterator ci(*res_it);
          if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
-            std::vector<std::vector<std::pair<const char*, float>>>* symbol =
-                ci.Timesteps();
-              hocr_str << "\n        <span class='ocr_symbol'"
+            std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
+            hocr_str << "\n        <span class='ocr_symbol'"
+                     << " id='"
+                     << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
+            for (auto timestep : *symbol) {
+              hocr_str << "\n         <span class='ocrx_cinfo'"
                       << " id='"
-                       << "symbol_" << page_id << "_" << wcnt << "_" << scnt
-                       << "'>";
-              for (auto timestep : *symbol) {
-                hocr_str << "\n         <span class='ocrx_cinfo'"
+                       << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+              for (auto conf : timestep) {
+                hocr_str << "\n          <span class='ocrx_cinfo'"
                         << " id='"
-                         << "timestep" << page_id << "_" << wcnt << "_" << tcnt
-                         << "'>";
-                for (auto conf : timestep) {
-                  hocr_str << "\n          <span class='ocrx_cinfo'"
-                           << " id='"
-                           << "choice_" << page_id << "_" << wcnt << "_" << ccnt
-                           << "'"
-                           << " title='x_confs " << int(conf.second * 100)
-                           << "'>" << HOcrEscape(conf.first).c_str()
-                           << "</span>";
-                  ++ccnt;
-                }
-                hocr_str << "</span>";
-                ++tcnt;
+                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                         << " title='x_confs " << int(conf.second * 100) << "'>"
+                         << HOcrEscape(conf.first).c_str() << "</span>";
+                ++ccnt;
              }
-              hocr_str << "\n        </span>";
-              ++scnt;
+              hocr_str << "</span>";
+              ++tcnt;
+            }
+            hocr_str << "\n        </span>";
+            ++scnt;
          } else if (lstm_choice_mode == 2) {
            tesseract::ChoiceIterator ci(*res_it);
            hocr_str << "\n        <span class='ocrx_cinfo'"
                     << " id='"
-                     << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
-                     << "'>";
+                     << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
            do {
-              const char* choice = ci.GetUTF8Text();
+              const char *choice = ci.GetUTF8Text();
              float choiceconf = ci.Confidence();
              if (choice != nullptr) {
                hocr_str << "\n         <span class='ocrx_cinfo'"
                         << " id='"
-                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt
-                         << "'"
-                         << " title='x_confs " << choiceconf << "'>"
-                         << HOcrEscape(choice).c_str() << "</span>";
+                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                         << " title='x_confs " << choiceconf << "'>" << HOcrEscape(choice).c_str()
+                         << "</span>";
                ccnt++;
              }
            } while (ci.Next());
@ -350,8 +335,10 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
-    if (italic) hocr_str << "</em>";
-    if (bold) hocr_str << "</strong>";
+    if (italic)
+      hocr_str << "</em>";
+    if (bold)
+      hocr_str << "</strong>";
    // If the lstm choice mode is required it is added here
    if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
      for (auto symbol : *rawTimestepMap) {
@ -361,13 +348,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
        for (auto timestep : symbol) {
          hocr_str << "\n        <span class='ocrx_cinfo'"
                   << " id='"
-                   << "timestep" << page_id << "_" << wcnt << "_" << tcnt
-                   << "'>";
+                   << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
          for (auto conf : timestep) {
            hocr_str << "\n         <span class='ocrx_cinfo'"
                     << " id='"
-                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt
-                     << "'"
+                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
                     << " title='x_confs " << int(conf.second * 100) << "'>"
                     << HOcrEscape(conf.first).c_str() << "</span>";
            ++ccnt;
@ -383,9 +368,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
        if (timestep.size() > 0) {
          hocr_str << "\n       <span class='ocrx_cinfo'"
                   << " id='"
-                   << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
-                   << "'>";
-          for (auto& j : timestep) {
+                   << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+          for (auto &j : timestep) {
            float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
            if (conf < 0.0f)
              conf = 0.0f;
@ -393,10 +377,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
              conf = 100.0f;
            hocr_str << "\n        <span class='ocrx_cinfo'"
                     << " id='"
-                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt
-                     << "'"
-                     << " title='x_confs " << conf << "'>"
-                     << HOcrEscape(j.first).c_str() << "</span>";
+                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                     << " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
+                     << "</span>";
            ccnt++;
          }
          hocr_str << "</span>";
@ -420,7 +403,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
    if (last_word_in_para) {
      hocr_str << "\n    </p>\n";
      pcnt++;
-      para_is_ltr = true;  // back to default direction
+      para_is_ltr = true; // back to default direction
    }
    if (last_word_in_block) {
      hocr_str << "   </div>\n";
@ -429,8 +412,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
  }
  hocr_str << "  </div>\n";

-  const std::string& text = hocr_str.str();
-  char* result = new char[text.length() + 1];
+  const std::string &text = hocr_str.str();
+  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  return result;
 }
@ -438,12 +421,12 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
 /**********************************************************************
 * HOcr Text Renderer interface implementation
 **********************************************************************/
-TessHOcrRenderer::TessHOcrRenderer(const char* outputbase)
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "hocr") {
  font_info_ = false;
 }

-TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info)
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr") {
  font_info_ = font_info;
 }
@ -460,11 +443,12 @@ bool TessHOcrRenderer::BeginDocumentHandler() {
      "</title>\n"
      "  <meta http-equiv=\"Content-Type\" content=\"text/html;"
      "charset=utf-8\"/>\n"
-      "  <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
+      "  <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
      "' />\n"
      "  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
      " ocr_line ocrx_word ocrp_wconf");
-  if (font_info_) AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
+  if (font_info_)
+    AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
  AppendString(
      "'/>\n"
      " </head>\n"
@ -479,13 +463,14 @@ bool TessHOcrRenderer::EndDocumentHandler() {
  return true;
 }

-bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
-  if (hocr == nullptr) return false;
+  if (hocr == nullptr)
+    return false;

  AppendString(hocr.get());

  return true;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/api/lstmboxrenderer.cpp
+++ b/src/api/lstmboxrenderer.cpp
@ -16,9 +16,9 @@
 *
 **********************************************************************/

-#include <tesseract/baseapi.h>  // for TessBaseAPI
+#include <tesseract/baseapi.h> // for TessBaseAPI
 #include <tesseract/renderer.h>
-#include "tesseractclass.h"  // for Tesseract
+#include "tesseractclass.h" // for Tesseract

 namespace tesseract {

@ -27,23 +27,23 @@ namespace tesseract {
 * page_number is a 0-base page index that will appear in the box file.
 * Returned string must be freed with the delete [] operator.
 */
-static void AddBoxToLSTM(int right, int bottom, int top, int image_height,
-                         int page_num, STRING* text) {
-  text->add_str_int(" ", image_height - bottom);
-  text->add_str_int(" ", right + 5);
-  text->add_str_int(" ", image_height - top);
-  text->add_str_int(" ", page_num);
+static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
+                         std::string &text) {
+  text += " " + std::to_string(image_height - bottom);
+  text += " " + std::to_string(right + 5);
+  text += " " + std::to_string(image_height - top);
+  text += " " + std::to_string(page_num);
 }

-char* TessBaseAPI::GetLSTMBoxText(int page_number=0) {
+char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
    return nullptr;

-  STRING lstm_box_str("");
+  std::string lstm_box_str;
  bool first_word = true;
  int left = 0, top = 0, right = 0, bottom = 0;

-  LTRResultIterator* res_it = GetLTRIterator();
+  LTRResultIterator *res_it = GetLTRIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->Empty(RIL_SYMBOL)) {
      res_it->Next(RIL_SYMBOL);
@ -52,38 +52,35 @@ char* TessBaseAPI::GetLSTMBoxText(int page_number=0) {
    if (!first_word) {
      if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
        if (res_it->IsAtBeginningOf(RIL_WORD)) {
-          lstm_box_str.add_str_int("  ", left);
-          AddBoxToLSTM(right, bottom, top, image_height_, page_number,
-                       &lstm_box_str);
-          lstm_box_str += "\n";  // end of row for word
-        }                        // word
+          lstm_box_str += "  " + std::to_string(left);
+          AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+          lstm_box_str += "\n"; // end of row for word
+        }                       // word
      } else {
        if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
-          lstm_box_str.add_str_int("\t ", left);
-          AddBoxToLSTM(right, bottom, top, image_height_, page_number,
-                       &lstm_box_str);
-          lstm_box_str += "\n";  // end of row for line
-        }                        // line
+          lstm_box_str += "\t " + std::to_string(left);
+          AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+          lstm_box_str += "\n"; // end of row for line
+        }                       // line
      }
-    }  // not first word
+    } // not first word
    first_word = false;
    // Use bounding box for whole line for everything
    res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
    do {
-      lstm_box_str +=
-          std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
+      lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
-    lstm_box_str.add_str_int(" ", left);
-    AddBoxToLSTM(right, bottom, top, image_height_, page_number, &lstm_box_str);
-    lstm_box_str += "\n";  // end of row for symbol
+    lstm_box_str += " " + std::to_string(left);
+    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+    lstm_box_str += "\n"; // end of row for symbol
  }
-  if (!first_word) {  // if first_word is true  => empty page
-    lstm_box_str.add_str_int("\t ", left);
-    AddBoxToLSTM(right, bottom, top, image_height_, page_number, &lstm_box_str);
-    lstm_box_str += "\n";  // end of PAGE
+  if (!first_word) { // if first_word is true  => empty page
+    lstm_box_str += "\t " + std::to_string(left);
+    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+    lstm_box_str += "\n"; // end of PAGE
  }
-  char* ret = new char[lstm_box_str.length() + 1];
+  char *ret = new char[lstm_box_str.length() + 1];
  strcpy(ret, lstm_box_str.c_str());
  delete res_it;
  return ret;
@ -92,16 +89,17 @@ char* TessBaseAPI::GetLSTMBoxText(int page_number=0) {
 /**********************************************************************
 * LSTMBox Renderer interface implementation
 **********************************************************************/
-TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char* outputbase)
+TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}

-bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBoxText(imagenum()));
-  if (lstmbox == nullptr) return false;
+  if (lstmbox == nullptr)
+    return false;

  AppendString(lstmbox.get());

  return true;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/api/pdf_ttf.h
+++ b/src/api/pdf_ttf.h
@ -0,0 +1,63 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pdf_ttf.h
+// Description: pdf.ttf (GlyphLessFont) replacement.
+//              Generated with: "bin2cpp pdf.ttf pdf_ttf cpp17"
+// Author:      Zdenko Podobny
+//
+// (C) Copyright 2020, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef pdf_ttf__H
+#define pdf_ttf__H
+
+#include <cstdint> // uint8_t
+
+static const uint8_t pdf_ttf[] = {
+    0x0,  0x1,  0x0,  0x0,  0x0,  0xa,  0x0,  0x80, 0x0,  0x3,  0x0,  0x20, 0x4f, 0x53, 0x2f, 0x32,
+    0x56, 0xde, 0xc8, 0x94, 0x0,  0x0,  0x1,  0x28, 0x0,  0x0,  0x0,  0x60, 0x63, 0x6d, 0x61, 0x70,
+    0x0,  0xa,  0x0,  0x34, 0x0,  0x0,  0x1,  0x90, 0x0,  0x0,  0x0,  0x1e, 0x67, 0x6c, 0x79, 0x66,
+    0x15, 0x22, 0x41, 0x24, 0x0,  0x0,  0x1,  0xb8, 0x0,  0x0,  0x0,  0x18, 0x68, 0x65, 0x61, 0x64,
+    0xb,  0x78, 0xf1, 0x65, 0x0,  0x0,  0x0,  0xac, 0x0,  0x0,  0x0,  0x36, 0x68, 0x68, 0x65, 0x61,
+    0xc,  0x2,  0x4,  0x2,  0x0,  0x0,  0x0,  0xe4, 0x0,  0x0,  0x0,  0x24, 0x68, 0x6d, 0x74, 0x78,
+    0x4,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x88, 0x0,  0x0,  0x0,  0x8,  0x6c, 0x6f, 0x63, 0x61,
+    0x0,  0xc,  0x0,  0x0,  0x0,  0x0,  0x1,  0xb0, 0x0,  0x0,  0x0,  0x6,  0x6d, 0x61, 0x78, 0x70,
+    0x0,  0x4,  0x0,  0x5,  0x0,  0x0,  0x1,  0x8,  0x0,  0x0,  0x0,  0x20, 0x6e, 0x61, 0x6d, 0x65,
+    0xf2, 0xeb, 0x16, 0xda, 0x0,  0x0,  0x1,  0xd0, 0x0,  0x0,  0x0,  0x4b, 0x70, 0x6f, 0x73, 0x74,
+    0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x2,  0x1c, 0x0,  0x0,  0x0,  0x20, 0x0,  0x1,  0x0,  0x0,
+    0x0,  0x1,  0x0,  0x0,  0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf,  0x3c, 0xf5, 0x4,  0x7,  0x8,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0xcf, 0x9a, 0xfc, 0x6e, 0x0,  0x0,  0x0,  0x0,  0xd4, 0xc3, 0xa7, 0xf2,
+    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x8,  0x0,  0x0,  0x0,  0x0,  0x10, 0x0,  0x2,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x8,  0x0,  0xff, 0xff, 0x0,  0x0,  0x4,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x2,  0x0,  0x4,
+    0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x1,  0x90, 0x0,  0x5,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x47, 0x4f, 0x4f, 0x47, 0x0,  0x40, 0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0xff, 0xff,
+    0x0,  0x0,  0x0,  0x1,  0x0,  0x1,  0x80, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x14, 0x0,  0x3,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x14, 0x0,  0x6,  0x0,  0xa,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0xc,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,
+    0x8,  0x0,  0x0,  0x3,  0x0,  0x0,  0x31, 0x21, 0x11, 0x21, 0x4,  0x0,  0xfc, 0x0,  0x8,  0x0,
+    0x0,  0x0,  0x0,  0x3,  0x0,  0x2a, 0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x0,  0x5,  0x0,  0x16,
+    0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0xb,  0x0,  0x16, 0x0,  0x3,
+    0x0,  0x1,  0x4,  0x9,  0x0,  0x5,  0x0,  0x16, 0x0,  0x0,  0x0,  0x56, 0x0,  0x65, 0x0,  0x72,
+    0x0,  0x73, 0x0,  0x69, 0x0,  0x6f, 0x0,  0x6e, 0x0,  0x20, 0x0,  0x31, 0x0,  0x2e, 0x0,  0x30,
+    0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0,  0x0,  0x1,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0};
+
+#endif
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@ -17,19 +17,23 @@

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
-#include "config_auto.h"
+#  include "config_auto.h"
 #endif

-#include <locale>  // for std::locale::classic
-#include <memory>  // std::unique_ptr
-#include <sstream> // for std::stringstream
-#include "allheaders.h"
-#include <tesseract/baseapi.h>
-#include <cmath>
-#include <tesseract/renderer.h>
-#include <cstring>
+#include "pdf_ttf.h"
 #include "tprintf.h"

+#include <allheaders.h>
+#include <tesseract/baseapi.h>
+#include <tesseract/renderer.h>
+#include <cmath>
+#include <cstring>
+#include <fstream>   // for std::ifstream
+#include <locale>    // for std::locale::classic
+#include <memory>    // std::unique_ptr
+#include <sstream>   // for std::stringstream
+#include "helpers.h" // for Swap
+
 /*

 Design notes from Ken Sharp, with light editing.
@ -176,11 +180,9 @@ static const int kMaxBytesPerCodepoint = 20;
 /**********************************************************************
 * PDF Renderer interface implementation
 **********************************************************************/
-TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir,
-                                 bool textonly)
-    : TessResultRenderer(outputbase, "pdf"),
-      datadir_(datadir) {
-  obj_  = 0;
+TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
+    : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
+  obj_ = 0;
  textonly_ = textonly;
  offsets_.push_back(0);
 }
@ -218,13 +220,12 @@ static long dist2(int x1, int y1, int x2, int y2) {
 // left-to-right no matter what the reading order is. We need the
 // word baseline in reading order, so we do that conversion here. Returns
 // the word's baseline origin and length.
-static void GetWordBaseline(int writing_direction, int ppi, int height,
-                            int word_x1, int word_y1, int word_x2, int word_y2,
-                            int line_x1, int line_y1, int line_x2, int line_y2,
-                            double *x0, double *y0, double *length) {
+static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,
+                            int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,
+                            int line_y2, double *x0, double *y0, double *length) {
  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
-    Swap(&word_x1, &word_x2);
-    Swap(&word_y1, &word_y2);
+    std::swap(word_x1, word_x2);
+    std::swap(word_y1, word_y2);
  }
  double word_length;
  double x, y;
@ -236,13 +237,11 @@ static void GetWordBaseline(int writing_direction, int ppi, int height,
      x = line_x1;
      y = line_y1;
    } else {
-      double t = ((px - line_x2) * (line_x2 - line_x1) +
-                  (py - line_y2) * (line_y2 - line_y1)) / l2;
+      double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2;
      x = line_x2 + t * (line_x2 - line_x1);
      y = line_y2 + t * (line_y2 - line_y1);
    }
-    word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
-                                                 word_x2, word_y2)));
+    word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
    word_length = word_length * 72.0 / ppi;
    x = x * 72 / ppi;
    y = height - (y * 72.0 / ppi);
@ -260,16 +259,15 @@ static void GetWordBaseline(int writing_direction, int ppi, int height,
 //                           RTL
 // [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
 // [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]
-static void AffineMatrix(int writing_direction,
-                         int line_x1, int line_y1, int line_x2, int line_y2,
+static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,
                         double *a, double *b, double *c, double *d) {
-  double theta = atan2(static_cast<double>(line_y1 - line_y2),
-                       static_cast<double>(line_x2 - line_x1));
+  double theta =
+      atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1));
  *a = cos(theta);
  *b = sin(theta);
  *c = -sin(theta);
  *d = cos(theta);
-  switch(writing_direction) {
+  switch (writing_direction) {
    case WRITING_DIRECTION_RIGHT_TO_LEFT:
      *a = -*a;
      *b = -*b;
@ -289,8 +287,7 @@ static void AffineMatrix(int writing_direction,
 // these viewers. I chose this threshold large enough to absorb noise,
 // but small enough that lines probably won't cross each other if the
 // whole page is tilted at almost exactly the clipping threshold.
-static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
-                         int *line_x1, int *line_y1,
+static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
                         int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_y1 = y1;
@ -313,21 +310,18 @@ static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
    int a = code - 0x010000;
    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
    int low_surrogate = (0x03FF & a) + 0xDC00;
-    snprintf(utf16, kMaxBytesPerCodepoint,
-             "%04X%04X", high_surrogate, low_surrogate);
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate);
  }
  return true;
 }

-char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
-                                         double width, double height) {
+char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
  double ppi = api->GetSourceYResolution();

  // These initial conditions are all arbitrary and will be overwritten
  double old_x = 0.0, old_y = 0.0;
  int old_fontsize = 0;
-  tesseract::WritingDirection old_writing_direction =
-      WRITING_DIRECTION_LEFT_TO_RIGHT;
+  tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
  bool new_block = true;
  int fontsize = 0;
  double a = 1;
@ -358,9 +352,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
  ResultIterator *res_it = api->GetIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
-      pdf_str << "BT\n3 Tr";     // Begin text object, use invisible ink
-      old_fontsize = 0;          // Every block will declare its fontsize
-      new_block = true;          // Every block will declare its affine matrix
+      pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
+      old_fontsize = 0;      // Every block will declare its fontsize
+      new_block = true;      // Every block will declare its affine matrix
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
@ -380,8 +374,7 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      tesseract::Orientation orientation;
      tesseract::TextlineOrder textline_order;
      float deskew_angle;
-      res_it->Orientation(&orientation, &writing_direction,
-                          &textline_order, &deskew_angle);
+      res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
        switch (res_it->WordDirection()) {
          case DIR_LEFT_TO_RIGHT:
@ -401,15 +394,12 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    {
      int word_x1, word_y1, word_x2, word_y2;
      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
-      GetWordBaseline(writing_direction, ppi, height,
-                      word_x1, word_y1, word_x2, word_y2,
-                      line_x1, line_y1, line_x2, line_y2,
-                      &x, &y, &word_length);
+      GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
+                      line_y1, line_x2, line_y2, &x, &y, &word_length);
    }

    if (writing_direction != old_writing_direction || new_block) {
-      AffineMatrix(writing_direction,
-                   line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
+      AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
      pdf_str << " " << prec(a) // . This affine matrix
              << " " << prec(b) // . sets the coordinate
              << " " << prec(c) // . system for all
@ -421,9 +411,8 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    } else {
      double dx = x - old_x;
      double dy = y - old_y;
-      pdf_str << " " << prec(dx * a + dy * b)
-              << " " << prec(dx * c + dy * d)
-              << (" Td ");      // Relative moveto
+      pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d)
+              << (" Td "); // Relative moveto
    }
    old_x = x;
    old_y = y;
@ -436,8 +425,8 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
-      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
-                                 &serif, &smallcaps, &fontsize, &font_id);
+      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
+                                 &fontsize, &font_id);
      const int kDefaultFontsize = 8;
      if (fontsize <= 0)
        fontsize = kDefaultFontsize;
@ -452,8 +441,7 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    std::string pdf_word;
    int pdf_word_len = 0;
    do {
-      const std::unique_ptr<const char[]> grapheme(
-          res_it->GetUTF8Text(RIL_SYMBOL));
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != '\0') {
        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
        char utf16[kMaxBytesPerCodepoint];
@ -471,21 +459,20 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      pdf_word_len++;
    }
    if (word_length > 0 && pdf_word_len > 0) {
-      double h_stretch =
-          kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
-      pdf_str << h_stretch << " Tz"     // horizontal stretch
-              << " [ <" << pdf_word     // UTF-16BE representation
-              << "> ] TJ";              // show the text
+      double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
+      pdf_str << h_stretch << " Tz" // horizontal stretch
+              << " [ <" << pdf_word // UTF-16BE representation
+              << "> ] TJ";          // show the text
    }
    if (last_word_in_line) {
      pdf_str << " \n";
    }
    if (last_word_in_block) {
-      pdf_str << "ET\n";         // end the text object
+      pdf_str << "ET\n"; // end the text object
    }
  }
-  const std::string& text = pdf_str.str();
-  char* result = new char[text.length() + 1];
+  const std::string &text = pdf_str.str();
+  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  delete res_it;
  return result;
@ -495,11 +482,12 @@ bool TessPDFRenderer::BeginDocumentHandler() {
  AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");

  // CATALOG
-  AppendPDFObject("1 0 obj\n"
-                  "<<\n"
-                  "  /Type /Catalog\n"
-                  "  /Pages 2 0 R\n"
-                  ">>\nendobj\n");
+  AppendPDFObject(
+      "1 0 obj\n"
+      "<<\n"
+      "  /Type /Catalog\n"
+      "  /Pages 2 0 R\n"
+      ">>\nendobj\n");

  // We are reserving object #2 for the /Pages
  // object, which I am going to create and write
@ -507,56 +495,58 @@ bool TessPDFRenderer::BeginDocumentHandler() {
  AppendPDFObject("");

  // TYPE0 FONT
-  AppendPDFObject("3 0 obj\n"
-                  "<<\n"
-                  "  /BaseFont /GlyphLessFont\n"
-                  "  /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
-                  "  /Encoding /Identity-H\n"
-                  "  /Subtype /Type0\n"
-                  "  /ToUnicode 6 0 R\n" // ToUnicode
-                  "  /Type /Font\n"
-                  ">>\n"
-                  "endobj\n");
+  AppendPDFObject(
+      "3 0 obj\n"
+      "<<\n"
+      "  /BaseFont /GlyphLessFont\n"
+      "  /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
+      "  /Encoding /Identity-H\n"
+      "  /Subtype /Type0\n"
+      "  /ToUnicode 6 0 R\n" // ToUnicode
+      "  /Type /Font\n"
+      ">>\n"
+      "endobj\n");

  // CIDFONTTYPE2
  std::stringstream stream;
  // Use "C" locale (needed for int values larger than 999).
  stream.imbue(std::locale::classic());
-  stream <<
-    "4 0 obj\n"
-    "<<\n"
-    "  /BaseFont /GlyphLessFont\n"
-    "  /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
-    "  /CIDSystemInfo\n"
-    "  <<\n"
-    "     /Ordering (Identity)\n"
-    "     /Registry (Adobe)\n"
-    "     /Supplement 0\n"
-    "  >>\n"
-    "  /FontDescriptor 7 0 R\n" // Font descriptor
-    "  /Subtype /CIDFontType2\n"
-    "  /Type /Font\n"
-    "  /DW " << (1000 / kCharWidth) << "\n"
-    ">>\n"
-    "endobj\n";
+  stream << "4 0 obj\n"
+            "<<\n"
+            "  /BaseFont /GlyphLessFont\n"
+            "  /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
+            "  /CIDSystemInfo\n"
+            "  <<\n"
+            "     /Ordering (Identity)\n"
+            "     /Registry (Adobe)\n"
+            "     /Supplement 0\n"
+            "  >>\n"
+            "  /FontDescriptor 7 0 R\n" // Font descriptor
+            "  /Subtype /CIDFontType2\n"
+            "  /Type /Font\n"
+            "  /DW "
+         << (1000 / kCharWidth)
+         << "\n"
+            ">>\n"
+            "endobj\n";
  AppendPDFObject(stream.str().c_str());

  // CIDTOGIDMAP
  const int kCIDToGIDMapSize = 2 * (1 << 16);
-  const std::unique_ptr<unsigned char[]> cidtogidmap(
-      new unsigned char[kCIDToGIDMapSize]);
+  const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
  for (int i = 0; i < kCIDToGIDMapSize; i++) {
    cidtogidmap[i] = (i % 2) ? 1 : 0;
  }
  size_t len;
  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
  stream.str("");
-  stream <<
-    "5 0 obj\n"
-    "<<\n"
-    "  /Length " << len << " /Filter /FlateDecode\n"
-    ">>\n"
-    "stream\n";
+  stream << "5 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << len
+         << " /Filter /FlateDecode\n"
+            ">>\n"
+            "stream\n";
  AppendString(stream.str().c_str());
  long objsize = stream.str().size();
  AppendData(reinterpret_cast<char *>(comp), len);
@ -594,65 +584,67 @@ bool TessPDFRenderer::BeginDocumentHandler() {

  // TOUNICODE
  stream.str("");
-  stream <<
-    "6 0 obj\n"
-    "<< /Length " << (sizeof(stream2) - 1) << " >>\n"
-    "stream\n" << stream2 <<
-    "endstream\n"
-    "endobj\n";
+  stream << "6 0 obj\n"
+            "<< /Length "
+         << (sizeof(stream2) - 1)
+         << " >>\n"
+            "stream\n"
+         << stream2
+         << "endstream\n"
+            "endobj\n";
  AppendPDFObject(stream.str().c_str());

  // FONT DESCRIPTOR
  stream.str("");
-  stream <<
-    "7 0 obj\n"
-    "<<\n"
-    "  /Ascent 1000\n"
-    "  /CapHeight 1000\n"
-    "  /Descent -1\n"       // Spec says must be negative
-    "  /Flags 5\n"          // FixedPitch + Symbolic
-    "  /FontBBox  [ 0 0 " << (1000 / kCharWidth) << " 1000 ]\n"
-    "  /FontFile2 8 0 R\n"
-    "  /FontName /GlyphLessFont\n"
-    "  /ItalicAngle 0\n"
-    "  /StemV 80\n"
-    "  /Type /FontDescriptor\n"
-    ">>\n"
-    "endobj\n";
+  stream << "7 0 obj\n"
+            "<<\n"
+            "  /Ascent 1000\n"
+            "  /CapHeight 1000\n"
+            "  /Descent -1\n" // Spec says must be negative
+            "  /Flags 5\n"    // FixedPitch + Symbolic
+            "  /FontBBox  [ 0 0 "
+         << (1000 / kCharWidth)
+         << " 1000 ]\n"
+            "  /FontFile2 8 0 R\n"
+            "  /FontName /GlyphLessFont\n"
+            "  /ItalicAngle 0\n"
+            "  /StemV 80\n"
+            "  /Type /FontDescriptor\n"
+            ">>\n"
+            "endobj\n";
  AppendPDFObject(stream.str().c_str());

  stream.str("");
  stream << datadir_.c_str() << "/pdf.ttf";
-  FILE *fp = fopen(stream.str().c_str(), "rb");
-  if (!fp) {
-    tprintf("Cannot open file \"%s\"!\n", stream.str().c_str());
-    return false;
+  const uint8_t *font;
+  std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
+  std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
+  auto size = buffer.size();
+  if (size) {
+    font = buffer.data();
+  } else {
+#if !defined(NDEBUG)
+    tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
+#endif
+    font = pdf_ttf;
+    size = sizeof(pdf_ttf);
  }
-  fseek(fp, 0, SEEK_END);
-  auto size = std::ftell(fp);
-  if (size < 0) {
-    fclose(fp);
-    return false;
-  }
-  fseek(fp, 0, SEEK_SET);
-  const std::unique_ptr<char[]> buffer(new char[size]);
-  if (!tesseract::DeSerialize(fp, buffer.get(), size)) {
-    fclose(fp);
-    return false;
-  }
-  fclose(fp);
+
  // FONTFILE2
  stream.str("");
-  stream <<
-    "8 0 obj\n"
-    "<<\n"
-    "  /Length " << size << "\n"
-    "  /Length1 " << size << "\n"
-    ">>\n"
-    "stream\n";
+  stream << "8 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << size
+         << "\n"
+            "  /Length1 "
+         << size
+         << "\n"
+            ">>\n"
+            "stream\n";
  AppendString(stream.str().c_str());
-  objsize  = stream.str().size();
-  AppendData(buffer.get(), size);
+  objsize = stream.str().size();
+  AppendData(reinterpret_cast<const char *>(font), size);
  objsize += size;
  AppendString(endstream_endobj);
  objsize += strlen(endstream_endobj);
@ -660,11 +652,8 @@ bool TessPDFRenderer::BeginDocumentHandler() {
  return true;
 }

-bool TessPDFRenderer::imageToPDFObj(Pix *pix,
-                                    const char* filename,
-                                    long int objnum,
-                                    char **pdf_object,
-                                    long int* pdf_object_size,
+bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,
+                                    char **pdf_object, long int *pdf_object_size,
                                    const int jpg_quality) {
  if (!pdf_object_size || !pdf_object)
    return false;
@ -689,7 +678,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,

  const char *group4 = "";
  const char *filter;
-  switch(cid->type) {
+  switch (cid->type) {
    case L_FLATE_ENCODE:
      filter = "/FlateDecode";
      break;
@ -715,15 +704,15 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  // Use "C" locale (needed for int values larger than 999).
  colorspace.imbue(std::locale::classic());
  if (cid->ncolors > 0) {
-    colorspace
-      << "  /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1)
-      << " " << cid->cmapdatahex << " ]\n";
+    colorspace << "  /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " "
+               << cid->cmapdatahex << " ]\n";
  } else {
    switch (cid->spp) {
      case 1:
        if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
-          colorspace.str("  /ColorSpace /DeviceGray\n"
-                         "  /Decode [1 0]\n");
+          colorspace.str(
+              "  /ColorSpace /DeviceGray\n"
+              "  /Decode [1 0]\n");
        } else {
          colorspace.str("  /ColorSpace /DeviceGray\n");
        }
@ -743,29 +732,43 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  std::stringstream b1;
  // Use "C" locale (needed for int values larger than 999).
  b1.imbue(std::locale::classic());
-  b1 <<
-    objnum << " 0 obj\n"
-    "<<\n"
-    "  /Length " << cid->nbytescomp << "\n"
-    "  /Subtype /Image\n";
+  b1 << objnum
+     << " 0 obj\n"
+        "<<\n"
+        "  /Length "
+     << cid->nbytescomp
+     << "\n"
+        "  /Subtype /Image\n";

  std::stringstream b2;
  // Use "C" locale (needed for int values larger than 999).
  b2.imbue(std::locale::classic());
-  b2 <<
-    "  /Width " << cid->w << "\n"
-    "  /Height " << cid->h << "\n"
-    "  /BitsPerComponent " << cid->bps << "\n"
-    "  /Filter " << filter << "\n"
-    "  /DecodeParms\n"
-    "  <<\n"
-    "    /Predictor " << predictor << "\n"
-    "    /Colors " << cid->spp << "\n" << group4 <<
-    "    /Columns " << cid->w << "\n"
-    "    /BitsPerComponent " << cid->bps << "\n"
-    "  >>\n"
-    ">>\n"
-    "stream\n";
+  b2 << "  /Width " << cid->w
+     << "\n"
+        "  /Height "
+     << cid->h
+     << "\n"
+        "  /BitsPerComponent "
+     << cid->bps
+     << "\n"
+        "  /Filter "
+     << filter
+     << "\n"
+        "  /DecodeParms\n"
+        "  <<\n"
+        "    /Predictor "
+     << predictor
+     << "\n"
+        "    /Colors "
+     << cid->spp << "\n"
+     << group4 << "    /Columns " << cid->w
+     << "\n"
+        "    /BitsPerComponent "
+     << cid->bps
+     << "\n"
+        "  >>\n"
+        ">>\n"
+        "stream\n";

  const char *b3 =
      "endstream\n"
@ -776,8 +779,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  size_t b3_len = strlen(b3);
  size_t colorspace_len = colorspace.str().size();

-  *pdf_object_size =
-      b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
+  *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
  *pdf_object = new char[*pdf_object_size];

  char *p = *pdf_object;
@ -794,9 +796,9 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  return true;
 }

-bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
  Pix *pix = api->GetInputImage();
-  const char* filename = api->GetInputName();
+  const char *filename = api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0)
    return false;
@ -815,21 +817,26 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
  // Use "C" locale (needed for double values width and height).
  stream.imbue(std::locale::classic());
  stream.precision(2);
-  stream << std::fixed <<
-    obj_ << " 0 obj\n"
-    "<<\n"
-    "  /Type /Page\n"
-    "  /Parent 2 0 R\n" // Pages object
-    "  /MediaBox [0 0 " << width << " " << height << "]\n"
-    "  /Contents " << (obj_ + 1) << " 0 R\n" // Contents object
-    "  /Resources\n"
-    "  <<\n"
-    "    " << xobject.str() << // Image object
-    "    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
-    "    /Font << /f-0-0 3 0 R >>\n" // Type0 Font
-    "  >>\n"
-    ">>\n"
-    "endobj\n";
+  stream << std::fixed << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Type /Page\n"
+            "  /Parent 2 0 R\n" // Pages object
+            "  /MediaBox [0 0 "
+         << width << " " << height
+         << "]\n"
+            "  /Contents "
+         << (obj_ + 1)
+         << " 0 R\n" // Contents object
+            "  /Resources\n"
+            "  <<\n"
+            "    "
+         << xobject.str() << // Image object
+      "    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
+      "    /Font << /f-0-0 3 0 R >>\n" // Type0 Font
+      "  >>\n"
+      ">>\n"
+      "endobj\n";
  pages_.push_back(obj_);
  AppendPDFObject(stream.str().c_str());

@ -837,16 +844,18 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
  const size_t pdftext_len = strlen(pdftext.get());
  size_t len;
-  unsigned char *comp_pdftext = zlibCompress(
-      reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
+  unsigned char *comp_pdftext =
+      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
  long comp_pdftext_len = len;
  stream.str("");
-  stream <<
-    obj_ << " 0 obj\n"
-    "<<\n"
-    "  /Length " << comp_pdftext_len << " /Filter /FlateDecode\n"
-    ">>\n"
-    "stream\n";
+  stream << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << comp_pdftext_len
+         << " /Filter /FlateDecode\n"
+            ">>\n"
+            "stream\n";
  AppendString(stream.str().c_str());
  long objsize = stream.str().size();
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
@ -863,8 +872,7 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
    char *pdf_object = nullptr;
    int jpg_quality;
    api->GetIntVariable("jpg_quality", &jpg_quality);
-    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
-                       jpg_quality)) {
+    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
      return false;
    }
    AppendData(pdf_object, objsize);
@ -874,7 +882,6 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
  return true;
 }

-
 bool TessPDFRenderer::EndDocumentHandler() {
  // We reserved the /Pages object number early, so that the /Page
  // objects could refer to their parent. We finally have enough
@ -884,16 +891,16 @@ bool TessPDFRenderer::EndDocumentHandler() {

  // PAGES
  const long int kPagesObjectNumber = 2;
-  offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
+  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
  std::stringstream stream;
  // Use "C" locale (needed for int values larger than 999).
  stream.imbue(std::locale::classic());
  stream << kPagesObjectNumber << " 0 obj\n<<\n  /Type /Pages\n  /Kids [ ";
  AppendString(stream.str().c_str());
-  size_t pages_objsize  = stream.str().size();
-  for (size_t i = 0; i < pages_.unsigned_size(); i++) {
+  size_t pages_objsize = stream.str().size();
+  for (const auto &page : pages_) {
    stream.str("");
-    stream << pages_[i] << " 0 R ";
+    stream << page << " 0 R ";
    AppendString(stream.str().c_str());
    pages_objsize += stream.str().size();
  }
@ -901,10 +908,10 @@ bool TessPDFRenderer::EndDocumentHandler() {
  stream << "]\n  /Count " << pages_.size() << "\n>>\nendobj\n";
  AppendString(stream.str().c_str());
  pages_objsize += stream.str().size();
-  offsets_.back() += pages_objsize;    // manipulation #2
+  offsets_.back() += pages_objsize; // manipulation #2

  // INFO
-  STRING utf16_title = "FEFF";  // byte_order_marker
+  std::string utf16_title = "FEFF"; // byte_order_marker
  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
  char utf16[kMaxBytesPerCodepoint];
  for (char32 code : unicodes) {
@ -913,16 +920,22 @@ bool TessPDFRenderer::EndDocumentHandler() {
    }
  }

-  char* datestr = l_getFormattedDate();
+  char *datestr = l_getFormattedDate();
  stream.str("");
-  stream
-    << obj_ << " 0 obj\n"
-       "<<\n"
-       "  /Producer (Tesseract " << tesseract::TessBaseAPI::Version() << ")\n"
-       "  /CreationDate (D:" << datestr << ")\n"
-       "  /Title <" << utf16_title.c_str() << ">\n"
-       ">>\n"
-       "endobj\n";
+  stream << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Producer (Tesseract "
+         << tesseract::TessBaseAPI::Version()
+         << ")\n"
+            "  /CreationDate (D:"
+         << datestr
+         << ")\n"
+            "  /Title <"
+         << utf16_title.c_str()
+         << ">\n"
+            ">>\n"
+            "endobj\n";
  lept_free(datestr);
  AppendPDFObject(stream.str().c_str());
  stream.str("");
@ -936,12 +949,15 @@ bool TessPDFRenderer::EndDocumentHandler() {
    AppendString(stream.str().c_str());
  }
  stream.str("");
-  stream
-    << "trailer\n<<\n  /Size " << obj_ << "\n"
-    "  /Root 1 0 R\n" // catalog
-    "  /Info " << (obj_ - 1) << " 0 R\n" // info
-    ">>\nstartxref\n" << offsets_.back() << "\n%%EOF\n";
+  stream << "trailer\n<<\n  /Size " << obj_
+         << "\n"
+            "  /Root 1 0 R\n" // catalog
+            "  /Info "
+         << (obj_ - 1)
+         << " 0 R\n" // info
+            ">>\nstartxref\n"
+         << offsets_.back() << "\n%%EOF\n";
  AppendString(stream.str().c_str());
  return true;
 }
-}  // namespace tesseract
+} // namespace tesseract
--- a/src/api/renderer.cpp
+++ b/src/api/renderer.cpp
@ -16,29 +16,29 @@
 ///////////////////////////////////////////////////////////////////////

 #ifdef HAVE_CONFIG_H
-#include "config_auto.h"
+#  include "config_auto.h"
 #endif
-
-#include <cstring>
-#include <memory>  // std::unique_ptr
 #include <tesseract/baseapi.h>
-#include <tesseract/genericvector.h>
 #include <tesseract/renderer.h>
+#include <cstring>
+#include <memory>     // std::unique_ptr
+#include <string>     // std::string
+#include "serialis.h" // Serialize

 namespace tesseract {

 /**********************************************************************
 * Base Renderer interface implementation
 **********************************************************************/
-TessResultRenderer::TessResultRenderer(const char *outputbase,
-                                       const char* extension)
-    : file_extension_(extension),
-      title_(""), imagenum_(-1),
-      fout_(stdout),
-      next_(nullptr),
-      happy_(true) {
+TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
+    : file_extension_(extension)
+    , title_("")
+    , imagenum_(-1)
+    , fout_(stdout)
+    , next_(nullptr)
+    , happy_(true) {
  if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
-    STRING outfile = STRING(outputbase) + STRING(".") + STRING(file_extension_);
+    std::string outfile = std::string(outputbase) + "." + extension;
    fout_ = fopen(outfile.c_str(), "wb");
    if (fout_ == nullptr) {
      happy_ = false;
@ -56,10 +56,11 @@ TessResultRenderer::~TessResultRenderer() {
  delete next_;
 }

-void TessResultRenderer::insert(TessResultRenderer* next) {
-  if (next == nullptr) return;
+void TessResultRenderer::insert(TessResultRenderer *next) {
+  if (next == nullptr)
+    return;

-  TessResultRenderer* remainder = next_;
+  TessResultRenderer *remainder = next_;
  next_ = next;
  if (remainder) {
    while (next->next_ != nullptr) {
@ -69,8 +70,9 @@ void TessResultRenderer::insert(TessResultRenderer* next) {
  }
 }

-bool TessResultRenderer::BeginDocument(const char* title) {
-  if (!happy_) return false;
+bool TessResultRenderer::BeginDocument(const char *title) {
+  if (!happy_)
+    return false;
  title_ = title;
  imagenum_ = -1;
  bool ok = BeginDocumentHandler();
@ -80,8 +82,9 @@ bool TessResultRenderer::BeginDocument(const char* title) {
  return ok;
 }

-bool TessResultRenderer::AddImage(TessBaseAPI* api) {
-  if (!happy_) return false;
+bool TessResultRenderer::AddImage(TessBaseAPI *api) {
+  if (!happy_)
+    return false;
  ++imagenum_;
  bool ok = AddImageHandler(api);
  if (next_) {
@ -91,7 +94,8 @@ bool TessResultRenderer::AddImage(TessBaseAPI* api) {
 }

 bool TessResultRenderer::EndDocument() {
-  if (!happy_) return false;
+  if (!happy_)
+    return false;
  bool ok = EndDocumentHandler();
  if (next_) {
    ok = next_->EndDocument() && ok;
@ -99,12 +103,13 @@ bool TessResultRenderer::EndDocument() {
  return ok;
 }

-void TessResultRenderer::AppendString(const char* s) {
+void TessResultRenderer::AppendString(const char *s) {
  AppendData(s, strlen(s));
 }

-void TessResultRenderer::AppendData(const char* s, int len) {
-  if (!tesseract::Serialize(fout_, s, len)) happy_ = false;
+void TessResultRenderer::AppendData(const char *s, int len) {
+  if (!tesseract::Serialize(fout_, s, len))
+    happy_ = false;
  fflush(fout_);
 }

@ -116,15 +121,13 @@ bool TessResultRenderer::EndDocumentHandler() {
  return happy_;
 }

-
 /**********************************************************************
 * UTF8 Text Renderer interface implementation
 **********************************************************************/
 TessTextRenderer::TessTextRenderer(const char *outputbase)
-    : TessResultRenderer(outputbase, "txt") {
-}
+    : TessResultRenderer(outputbase, "txt") {}

-bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> utf8(api->GetUTF8Text());
  if (utf8 == nullptr) {
    return false;
@ -132,7 +135,7 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {

  AppendString(utf8.get());

-  const char* pageSeparator = api->GetStringVariable("page_separator");
+  const char *pageSeparator = api->GetStringVariable("page_separator");
  if (pageSeparator != nullptr && *pageSeparator != '\0') {
    AppendString(pageSeparator);
  }
@ -143,12 +146,11 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
 /**********************************************************************
 * TSV Text Renderer interface implementation
 **********************************************************************/
-TessTsvRenderer::TessTsvRenderer(const char* outputbase)
-    : TessResultRenderer(outputbase, "tsv") {
+TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
  font_info_ = false;
 }

-TessTsvRenderer::TessTsvRenderer(const char* outputbase, bool font_info)
+TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "tsv") {
  font_info_ = font_info;
 }
@ -161,11 +163,14 @@ bool TessTsvRenderer::BeginDocumentHandler() {
  return true;
 }

-bool TessTsvRenderer::EndDocumentHandler() { return true; }
+bool TessTsvRenderer::EndDocumentHandler() {
+  return true;
+}

-bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
-  if (tsv == nullptr) return false;
+  if (tsv == nullptr)
+    return false;

  AppendString(tsv.get());

@ -176,12 +181,12 @@ bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
 * UNLV Text Renderer interface implementation
 **********************************************************************/
 TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
-    : TessResultRenderer(outputbase, "unlv") {
-}
+    : TessResultRenderer(outputbase, "unlv") {}

-bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> unlv(api->GetUNLVText());
-  if (unlv == nullptr) return false;
+  if (unlv == nullptr)
+    return false;

  AppendString(unlv.get());

@ -192,12 +197,12 @@ bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
 * BoxText Renderer interface implementation
 **********************************************************************/
 TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
-    : TessResultRenderer(outputbase, "box") {
-}
+    : TessResultRenderer(outputbase, "box") {}

-bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) {
+bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> text(api->GetBoxText(imagenum()));
-  if (text == nullptr) return false;
+  if (text == nullptr)
+    return false;

  AppendString(text.get());

@ -209,12 +214,12 @@ bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) {
 /**********************************************************************
 * Osd Text Renderer interface implementation
 **********************************************************************/
-TessOsdRenderer::TessOsdRenderer(const char* outputbase)
-    : TessResultRenderer(outputbase, "osd") {}
+TessOsdRenderer::TessOsdRenderer(const char *outputbase) : TessResultRenderer(outputbase, "osd") {}

-bool TessOsdRenderer::AddImageHandler(TessBaseAPI* api) {
-  char* osd = api->GetOsdText(imagenum());
-  if (osd == nullptr) return false;
+bool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {
+  char *osd = api->GetOsdText(imagenum());
+  if (osd == nullptr)
+    return false;

  AppendString(osd);
  delete[] osd;
@ -224,4 +229,4 @@ bool TessOsdRenderer::AddImageHandler(TessBaseAPI* api) {

 #endif // ndef DISABLED_LEGACY_ENGINE

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/api/tesseractmain.cpp
+++ b/src/api/tesseractmain.cpp
@ -18,42 +18,44 @@

 // Include automatically generated configuration file if running autoconf
 #ifdef HAVE_CONFIG_H
-#include "config_auto.h"
+#  include "config_auto.h"
 #endif

-#include <cerrno>               // for errno
+#include <cerrno> // for errno
+#if defined(__USE_GNU)
+#  include <cfenv> // for feenableexcept
+#endif
 #include <iostream>

-#include "allheaders.h"
+#include <allheaders.h>
 #include <tesseract/baseapi.h>
 #include "dict.h"
 #if defined(USE_OPENCL)
-#include "openclwrapper.h"      // for OpenclDevice
+#  include "openclwrapper.h" // for OpenclDevice
 #endif
 #include <tesseract/renderer.h>
 #include "simddetect.h"
-#include "tprintf.h"            // for tprintf
+#include "tprintf.h" // for tprintf

 #ifdef _OPENMP
-#include <omp.h>
+#  include <omp.h>
 #endif

 #if defined(HAVE_LIBARCHIVE)
-#include <archive.h>
+#  include <archive.h>
 #endif
 #if defined(HAVE_LIBCURL)
-#include <curl/curl.h>
+#  include <curl/curl.h>
 #endif

 #if defined(_WIN32)
-#include <fcntl.h>
-#include <io.h>
-#if defined(HAVE_TIFFIO_H)
+#  include <fcntl.h>
+#  include <io.h>
+#  if defined(HAVE_TIFFIO_H)

-#include <tiffio.h>
+#    include <tiffio.h>

-static void Win32ErrorHandler(const char* module, const char* fmt,
-                              va_list ap) {
+static void Win32ErrorHandler(const char *module, const char *fmt, va_list ap) {
  if (module != nullptr) {
    fprintf(stderr, "%s: ", module);
  }
@ -61,8 +63,7 @@ static void Win32ErrorHandler(const char* module, const char* fmt,
  fprintf(stderr, ".\n");
 }

-static void Win32WarningHandler(const char* module, const char* fmt,
-                                va_list ap) {
+static void Win32WarningHandler(const char *module, const char *fmt, va_list ap) {
  if (module != nullptr) {
    fprintf(stderr, "%s: ", module);
  }
@ -71,10 +72,10 @@ static void Win32WarningHandler(const char* module, const char* fmt,
  fprintf(stderr, ".\n");
 }

-#endif /* HAVE_TIFFIO_H */
+#  endif /* HAVE_TIFFIO_H */

 class AutoWin32ConsoleOutputCP {
- public:
+public:
  explicit AutoWin32ConsoleOutputCP(UINT codeCP) {
    oldCP_ = GetConsoleOutputCP();
    SetConsoleOutputCP(codeCP);
@ -82,16 +83,19 @@ class AutoWin32ConsoleOutputCP {
  ~AutoWin32ConsoleOutputCP() {
    SetConsoleOutputCP(oldCP_);
  }
- private:  
+
+private:
  UINT oldCP_;
 };

 static AutoWin32ConsoleOutputCP autoWin32ConsoleOutputCP(CP_UTF8);

-#endif   // _WIN32
+#endif // _WIN32
+
+using namespace tesseract;

 static void PrintVersionInfo() {
-  char* versionStrP;
+  char *versionStrP;

  printf("tesseract %s\n", tesseract::TessBaseAPI::Version());

@ -112,22 +116,18 @@ static void PrintVersionInfo() {
    printf("  Found %u platform(s).\n", num_platforms);
    for (unsigned n = 0; n < num_platforms; n++) {
      char info[256];
-      if (clGetPlatformInfo(platform[n], CL_PLATFORM_NAME, 256, info, 0) ==
-          CL_SUCCESS) {
+      if (clGetPlatformInfo(platform[n], CL_PLATFORM_NAME, 256, info, 0) == CL_SUCCESS) {
        printf("  Platform %u name: %s.\n", n + 1, info);
      }
-      if (clGetPlatformInfo(platform[n], CL_PLATFORM_VERSION, 256, info, 0) ==
-          CL_SUCCESS) {
+      if (clGetPlatformInfo(platform[n], CL_PLATFORM_VERSION, 256, info, 0) == CL_SUCCESS) {
        printf("  Version: %s.\n", info);
      }
      cl_device_id devices[2];
      cl_uint num_devices;
-      if (clGetDeviceIDs(platform[n], CL_DEVICE_TYPE_ALL, 2, devices,
-                         &num_devices) == CL_SUCCESS) {
+      if (clGetDeviceIDs(platform[n], CL_DEVICE_TYPE_ALL, 2, devices, &num_devices) == CL_SUCCESS) {
        printf("  Found %u device(s).\n", num_devices);
        for (unsigned i = 0; i < num_devices; ++i) {
-          if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0) ==
-              CL_SUCCESS) {
+          if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0) == CL_SUCCESS) {
            printf("    Device %u name: %s.\n", i + 1, info);
          }
        }
@ -135,12 +135,23 @@ static void PrintVersionInfo() {
    }
  }
 #endif
-  if (tesseract::SIMDDetect::IsAVX512BWAvailable()) printf(" Found AVX512BW\n");
-  if (tesseract::SIMDDetect::IsAVX512FAvailable()) printf(" Found AVX512F\n");
-  if (tesseract::SIMDDetect::IsAVX2Available()) printf(" Found AVX2\n");
-  if (tesseract::SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n");
-  if (tesseract::SIMDDetect::IsFMAAvailable()) printf(" Found FMA\n");
-  if (tesseract::SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n");
+#if defined(HAVE_NEON) || defined(__aarch64__)
+  if (tesseract::SIMDDetect::IsNEONAvailable())
+    printf(" Found NEON\n");
+#else
+  if (tesseract::SIMDDetect::IsAVX512BWAvailable())
+    printf(" Found AVX512BW\n");
+  if (tesseract::SIMDDetect::IsAVX512FAvailable())
+    printf(" Found AVX512F\n");
+  if (tesseract::SIMDDetect::IsAVX2Available())
+    printf(" Found AVX2\n");
+  if (tesseract::SIMDDetect::IsAVXAvailable())
+    printf(" Found AVX\n");
+  if (tesseract::SIMDDetect::IsFMAAvailable())
+    printf(" Found FMA\n");
+  if (tesseract::SIMDDetect::IsSSEAvailable())
+    printf(" Found SSE\n");
+#endif
 #ifdef _OPENMP
  printf(" Found OpenMP %d\n", _OPENMP);
 #endif
@ -149,19 +160,20 @@ static void PrintVersionInfo() {
  printf(" Found %s\n", archive_version_details());
 #  else
  printf(" Found %s\n", archive_version_string());
-#  endif  // ARCHIVE_VERSION_NUMBER
-#endif    // HAVE_LIBARCHIVE
+#  endif // ARCHIVE_VERSION_NUMBER
+#endif   // HAVE_LIBARCHIVE
 #if defined(HAVE_LIBCURL)
  printf(" Found %s\n", curl_version());
 #endif
 }

 static void PrintHelpForPSM() {
-  const char* msg =
+  const char *msg =
      "Page segmentation modes:\n"
      "  0    Orientation and script detection (OSD) only.\n"
      "  1    Automatic page segmentation with OSD.\n"
-      "  2    Automatic page segmentation, but no OSD, or OCR. (not implemented)\n"
+      "  2    Automatic page segmentation, but no OSD, or OCR. (not "
+      "implemented)\n"
      "  3    Fully automatic page segmentation, but no OSD. (Default)\n"
      "  4    Assume a single column of text of variable sizes.\n"
      "  5    Assume a single uniform block of vertically aligned text.\n"
@ -177,8 +189,7 @@ static void PrintHelpForPSM() {
      "       bypassing hacks that are Tesseract-specific.\n";

 #ifdef DISABLED_LEGACY_ENGINE
-  const char* disabled_osd_msg =
-      "\nNOTE: The OSD modes are currently disabled.\n";
+  const char *disabled_osd_msg = "\nNOTE: The OSD modes are currently disabled.\n";
  printf("%s%s", msg, disabled_osd_msg);
 #else
  printf("%s", msg);
@ -187,7 +198,7 @@ static void PrintHelpForPSM() {

 #ifndef DISABLED_LEGACY_ENGINE
 static void PrintHelpForOEM() {
-  const char* msg =
+  const char *msg =
      "OCR Engine modes:\n"
      "  0    Legacy engine only.\n"
      "  1    Neural nets LSTM engine only.\n"
@ -196,9 +207,9 @@ static void PrintHelpForOEM() {

  printf("%s", msg);
 }
-#endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE

-static void PrintHelpExtra(const char* program) {
+static void PrintHelpExtra(const char *program) {
  printf(
      "Usage:\n"
      "  %s --help | --help-extra | --help-psm | "
@ -208,7 +219,8 @@ static void PrintHelpExtra(const char* program) {
      "--version\n"
      "  %s --list-langs [--tessdata-dir PATH]\n"
      "  %s --print-parameters [options...] [configfile...]\n"
-      "  %s imagename|imagelist|stdin outputbase|stdout [options...] [configfile...]\n"
+      "  %s imagename|imagelist|stdin outputbase|stdout [options...] "
+      "[configfile...]\n"
      "\n"
      "OCR options:\n"
      "  --tessdata-dir PATH   Specify the location of tessdata path.\n"
@ -224,8 +236,7 @@ static void PrintHelpExtra(const char* program) {
 #endif
      "NOTE: These options must occur before any configfile.\n"
      "\n",
-      program, program, program, program
-  );
+      program, program, program, program);

  PrintHelpForPSM();
 #ifndef DISABLED_LEGACY_ENGINE
@ -244,11 +255,10 @@ static void PrintHelpExtra(const char* program) {
 #endif
      "  -v, --version         Show version information.\n"
      "  --list-langs          List available languages for tesseract engine.\n"
-      "  --print-parameters    Print tesseract parameters.\n"
-  );
+      "  --print-parameters    Print tesseract parameters.\n");
 }

-static void PrintHelpMessage(const char* program) {
+static void PrintHelpMessage(const char *program) {
  printf(
      "Usage:\n"
      "  %s --help | --help-extra | --version\n"
@ -263,22 +273,23 @@ static void PrintHelpMessage(const char* program) {
      "  --help                Show this help message.\n"
      "  --help-extra          Show extra help for advanced users.\n"
      "  --version             Show version information.\n"
-      "  --list-langs          List available languages for tesseract engine.\n",
-      program, program, program
-  );
+      "  --list-langs          List available languages for tesseract "
+      "engine.\n",
+      program, program, program);
 }

-static void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc,
-                                   char** argv) {
+static bool SetVariablesFromCLArgs(tesseract::TessBaseAPI *api, int argc, char **argv) {
+  bool success = true;
  char opt1[256], opt2[255];
  for (int i = 0; i < argc; i++) {
    if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
      strncpy(opt1, argv[i + 1], 255);
      opt1[255] = '\0';
-      char* p = strchr(opt1, '=');
+      char *p = strchr(opt1, '=');
      if (!p) {
        fprintf(stderr, "Missing = in configvar assignment\n");
-        exit(EXIT_FAILURE);
+        success = false;
+        break;
      }
      *p = 0;
      strncpy(opt2, strchr(argv[i + 1], '=') + 1, sizeof(opt2) - 1);
@ -290,15 +301,15 @@ static void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc,
      }
    }
  }
+  return success;
 }

-static void PrintLangsList(tesseract::TessBaseAPI* api) {
-  GenericVector<STRING> languages;
+static void PrintLangsList(tesseract::TessBaseAPI *api) {
+  std::vector<std::string> languages;
  api->GetAvailableLanguagesAsVector(&languages);
-  printf("List of available languages (%d):\n", languages.size());
-  for (int index = 0; index < languages.size(); ++index) {
-    STRING& string = languages[index];
-    printf("%s\n", string.c_str());
+  printf("List of available languages (%zu):\n", languages.size());
+  for (const auto &language : languages) {
+    printf("%s\n", language.c_str());
  }
  api->End();
 }
@ -322,27 +333,25 @@ static void PrintBanner() {
 * It would be simpler if we could set the value before Init,
 * but that doesn't work.
 */
-static void FixPageSegMode(tesseract::TessBaseAPI* api,
-                           tesseract::PageSegMode pagesegmode) {
+static void FixPageSegMode(tesseract::TessBaseAPI *api, tesseract::PageSegMode pagesegmode) {
  if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
    api->SetPageSegMode(pagesegmode);
 }

-static void checkArgValues(int arg, const char* mode, int count) {
+static bool checkArgValues(int arg, const char *mode, int count) {
  if (arg >= count || arg < 0) {
    printf("Invalid %s value, please enter a number between 0-%d\n", mode, count - 1);
-    exit(EXIT_SUCCESS);
+    return false;
  }
+  return true;
 }

 // NOTE: arg_i is used here to avoid ugly *i so many times in this function
-static void ParseArgs(const int argc, char** argv, const char** lang,
-                      const char** image, const char** outputbase,
-                      const char** datapath, l_int32* dpi, bool* list_langs,
-                      bool* print_parameters, GenericVector<STRING>* vars_vec,
-                      GenericVector<STRING>* vars_values, l_int32* arg_i,
-                      tesseract::PageSegMode* pagesegmode,
-                      tesseract::OcrEngineMode* enginemode) {
+static bool ParseArgs(int argc, char **argv, const char **lang, const char **image,
+                      const char **outputbase, const char **datapath, l_int32 *dpi,
+                      bool *list_langs, bool *print_parameters, std::vector<std::string> *vars_vec,
+                      std::vector<std::string> *vars_values, l_int32 *arg_i,
+                      tesseract::PageSegMode *pagesegmode, tesseract::OcrEngineMode *enginemode) {
  bool noocr = false;
  int i;
  for (i = 1; i < argc && (*outputbase == nullptr || argv[i][0] == '-'); i++) {
@ -363,8 +372,7 @@ static void ParseArgs(const int argc, char** argv, const char** lang,
      PrintHelpForOEM();
      noocr = true;
 #endif
-    } else if ((strcmp(argv[i], "-v") == 0) ||
-               (strcmp(argv[i], "--version") == 0)) {
+    } else if ((strcmp(argv[i], "-v") == 0) || (strcmp(argv[i], "--version") == 0)) {
      PrintVersionInfo();
      noocr = true;
    } else if (strcmp(argv[i], "-l") == 0 && i + 1 < argc) {
@ -388,13 +396,17 @@ static void ParseArgs(const int argc, char** argv, const char** lang,
      noocr = true;
      *list_langs = true;
    } else if (strcmp(argv[i], "--psm") == 0 && i + 1 < argc) {
-      checkArgValues(atoi(argv[i+1]), "PSM", tesseract::PSM_COUNT);
+      if (!checkArgValues(atoi(argv[i + 1]), "PSM", tesseract::PSM_COUNT)) {
+        return false;
+      }
      *pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
      ++i;
    } else if (strcmp(argv[i], "--oem") == 0 && i + 1 < argc) {
 #ifndef DISABLED_LEGACY_ENGINE
      int oem = atoi(argv[i + 1]);
-      checkArgValues(oem, "OEM", tesseract::OEM_COUNT);
+      if (!checkArgValues(oem, "OEM", tesseract::OEM_COUNT)) {
+        return false;
+      }
      *enginemode = static_cast<tesseract::OcrEngineMode>(oem);
 #endif
      ++i;
@ -409,7 +421,7 @@ static void ParseArgs(const int argc, char** argv, const char** lang,
    } else {
      // Unexpected argument.
      fprintf(stderr, "Error, unknown command line argument '%s'\n", argv[i]);
-      exit(EXIT_FAILURE);
+      return false;
    }
  }

@ -429,18 +441,19 @@ static void ParseArgs(const int argc, char** argv, const char** lang,

  if (*outputbase == nullptr && noocr == false) {
    PrintHelpMessage(argv[0]);
-    exit(EXIT_FAILURE);
+    return false;
  }
+
+  return true;
 }

-static void PreloadRenderers(
-    tesseract::TessBaseAPI* api,
-    tesseract::PointerVector<tesseract::TessResultRenderer>* renderers,
-    tesseract::PageSegMode pagesegmode, const char* outputbase) {
+static void PreloadRenderers(tesseract::TessBaseAPI *api,
+                             tesseract::PointerVector<tesseract::TessResultRenderer> *renderers,
+                             tesseract::PageSegMode pagesegmode, const char *outputbase) {
  if (pagesegmode == tesseract::PSM_OSD_ONLY) {
 #ifndef DISABLED_LEGACY_ENGINE
    renderers->push_back(new tesseract::TessOsdRenderer(outputbase));
-#endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE
  } else {
    bool error = false;
    bool b;
@ -448,28 +461,24 @@ static void PreloadRenderers(
    if (b) {
      bool font_info;
      api->GetBoolVariable("hocr_font_info", &font_info);
-      auto* renderer =
-          new tesseract::TessHOcrRenderer(outputbase, font_info);
+      auto *renderer = new tesseract::TessHOcrRenderer(outputbase, font_info);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create hOCR output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create hOCR output file: %s\n", strerror(errno));
        error = true;
      }
    }

    api->GetBoolVariable("tessedit_create_alto", &b);
    if (b) {
-      auto* renderer =
-              new tesseract::TessAltoRenderer(outputbase);
+      auto *renderer = new tesseract::TessAltoRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create ALTO output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create ALTO output file: %s\n", strerror(errno));
        error = true;
      }
    }
@ -478,35 +487,30 @@ static void PreloadRenderers(
    if (b) {
      bool font_info;
      api->GetBoolVariable("hocr_font_info", &font_info);
-      auto* renderer =
-          new tesseract::TessTsvRenderer(outputbase, font_info);
+      auto *renderer = new tesseract::TessTsvRenderer(outputbase, font_info);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create TSV output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create TSV output file: %s\n", strerror(errno));
        error = true;
      }
    }

    api->GetBoolVariable("tessedit_create_pdf", &b);
    if (b) {
-      #ifdef WIN32
-        if (_setmode(_fileno(stdout), _O_BINARY) == -1)
-          tprintf("ERROR: cin to binary: %s", strerror(errno));
-      #endif  // WIN32
+#ifdef WIN32
+      if (_setmode(_fileno(stdout), _O_BINARY) == -1)
+        tprintf("ERROR: cin to binary: %s", strerror(errno));
+#endif // WIN32
      bool textonly;
      api->GetBoolVariable("textonly_pdf", &textonly);
-      auto* renderer =
-        new tesseract::TessPDFRenderer(outputbase, api->GetDatapath(),
-                                       textonly);
+      auto *renderer = new tesseract::TessPDFRenderer(outputbase, api->GetDatapath(), textonly);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create PDF output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create PDF output file: %s\n", strerror(errno));
        error = true;
      }
    }
@ -514,56 +518,48 @@ static void PreloadRenderers(
    api->GetBoolVariable("tessedit_write_unlv", &b);
    if (b) {
      api->SetVariable("unlv_tilde_crunching", "true");
-      auto* renderer =
-        new tesseract::TessUnlvRenderer(outputbase);
+      auto *renderer = new tesseract::TessUnlvRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create UNLV output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create UNLV output file: %s\n", strerror(errno));
        error = true;
      }
    }

    api->GetBoolVariable("tessedit_create_lstmbox", &b);
    if (b) {
-      auto* renderer =
-        new tesseract::TessLSTMBoxRenderer(outputbase);
+      auto *renderer = new tesseract::TessLSTMBoxRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create LSTM BOX output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create LSTM BOX output file: %s\n", strerror(errno));
        error = true;
      }
    }

    api->GetBoolVariable("tessedit_create_boxfile", &b);
    if (b) {
-      auto* renderer =
-        new tesseract::TessBoxTextRenderer(outputbase);
+      auto *renderer = new tesseract::TessBoxTextRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create BOX output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create BOX output file: %s\n", strerror(errno));
        error = true;
      }
    }

    api->GetBoolVariable("tessedit_create_wordstrbox", &b);
    if (b) {
-      auto* renderer =
-        new tesseract::TessWordStrBoxRenderer(outputbase);
+      auto *renderer = new tesseract::TessWordStrBoxRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create WordStr BOX output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create WordStr BOX output file: %s\n", strerror(errno));
        error = true;
      }
    }
@ -573,14 +569,12 @@ static void PreloadRenderers(
      // Create text output if no other output was requested
      // even if text output was not explicitly requested unless
      // there was an error.
-      auto* renderer =
-        new tesseract::TessTextRenderer(outputbase);
+      auto *renderer = new tesseract::TessTextRenderer(outputbase);
      if (renderer->happy()) {
        renderers->push_back(renderer);
      } else {
        delete renderer;
-        tprintf("Error, could not create TXT output file: %s\n",
-                strerror(errno));
+        tprintf("Error, could not create TXT output file: %s\n", strerror(errno));
      }
    }
  }
@ -595,17 +589,25 @@ static void PreloadRenderers(
  }
 }

-
 /**********************************************************************
 *  main()
 *
 **********************************************************************/

-int main(int argc, char** argv) {
-  const char* lang = nullptr;
-  const char* image = nullptr;
-  const char* outputbase = nullptr;
-  const char* datapath = nullptr;
+int main(int argc, char **argv) {
+#if defined(__USE_GNU)
+  // Enable floating-point exception traps so that the selected FP errors raise SIGFPE.
+#  if defined(__clang__)
+  // clang creates code which causes some FP exceptions, so don't enable those.
+  feenableexcept(FE_DIVBYZERO);
+#  else
+  feenableexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_INVALID);
+#  endif
+#endif
+  const char *lang = nullptr;
+  const char *image = nullptr;
+  const char *outputbase = nullptr;
+  const char *datapath = nullptr;
  bool list_langs = false;
  bool print_parameters = false;
  l_int32 dpi = 0;
@ -616,13 +618,10 @@ int main(int argc, char** argv) {
 #else
  tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
 #endif
-  /* main() calls functions like ParseArgs which call exit().
-   * This results in memory leaks if vars_vec and vars_values are
-   * declared as auto variables (destructor is not called then). */
-  static GenericVector<STRING> vars_vec;
-  static GenericVector<STRING> vars_values;
+  std::vector<std::string> vars_vec;
+  std::vector<std::string> vars_values;

-#if !defined(DEBUG)
+#if defined(NDEBUG)
  // Disable debugging and informational messages from Leptonica.
  setMsgSeverity(L_SEVERITY_ERROR);
 #endif
@ -633,9 +632,10 @@ int main(int argc, char** argv) {
  TIFFSetWarningHandler(Win32WarningHandler);
 #endif // HAVE_TIFFIO_H && _WIN32

-  ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &dpi,
-            &list_langs, &print_parameters, &vars_vec, &vars_values, &arg_i,
-            &pagesegmode, &enginemode);
+  if (!ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &dpi, &list_langs,
+                 &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode, &enginemode)) {
+    return EXIT_FAILURE;
+  }

  if (lang == nullptr) {
    // Set default language if none was given.
@ -650,15 +650,16 @@ int main(int argc, char** argv) {
  // first TessBaseAPI must be destructed, DawgCache must be the last object.
  tesseract::Dict::GlobalDawgCache();

-  // Avoid memory leak caused by auto variable when return is called.
-  static tesseract::TessBaseAPI api;
+  tesseract::TessBaseAPI api;

  api.SetOutputName(outputbase);

-  const int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
-                             argc - arg_i, &vars_vec, &vars_values, false);
+  const int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]), argc - arg_i,
+                                   &vars_vec, &vars_values, false);

-  SetVariablesFromCLArgs(&api, argc, argv);
+  if (!SetVariablesFromCLArgs(&api, argc, argv)) {
+    return EXIT_FAILURE;
+  }

  // SIMD settings might be overridden by config variable.
  tesseract::SIMDDetect::Update();
@ -674,7 +675,7 @@ int main(int argc, char** argv) {
  }

  if (print_parameters) {
-    FILE* fout = stdout;
+    FILE *fout = stdout;
    fprintf(stdout, "Tesseract parameters:\n");
    api.PrintVariables(fout);
    api.End();
@ -692,7 +693,7 @@ int main(int argc, char** argv) {
  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
    int ret_val = EXIT_SUCCESS;

-    Pix* pixs = pixRead(image);
+    Pix *pixs = pixRead(image);
    if (!pixs) {
      fprintf(stderr, "Leptonica can't process input file: %s\n", image);
      return 2;
@ -705,7 +706,7 @@ int main(int argc, char** argv) {
    tesseract::TextlineOrder order;
    float deskew_angle;

-    const tesseract::PageIterator* it = api.AnalyseLayout();
+    const tesseract::PageIterator *it = api.AnalyseLayout();
    if (it) {
      // TODO: Implement output of page segmentation, see documentation
      // ("Automatic page segmentation, but no OSD, or OCR").
@ -728,35 +729,36 @@ int main(int argc, char** argv) {
  // ambigs.train, box.train, box.train.stderr, linebox, rebox, lstm.train.
  // In this mode no other OCR result files are written.
  bool b = false;
-  bool in_training_mode =
-      (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
-      (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
-      (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b) ||
-      (api.GetBoolVariable("tessedit_train_line_recognizer", &b) && b);
+  bool in_training_mode = (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
+                          (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
+                          (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b) ||
+                          (api.GetBoolVariable("tessedit_train_line_recognizer", &b) && b);

 #ifdef DISABLED_LEGACY_ENGINE
  auto cur_psm = api.GetPageSegMode();
  auto osd_warning = std::string("");
  if (cur_psm == tesseract::PSM_OSD_ONLY) {
-    const char* disabled_osd_msg =
-        "\nERROR: The page segmentation mode 0 (OSD Only) is currently disabled.\n\n";
-    fprintf(stderr, "%s",  disabled_osd_msg);
+    const char *disabled_osd_msg =
+        "\nERROR: The page segmentation mode 0 (OSD Only) is currently "
+        "disabled.\n\n";
+    fprintf(stderr, "%s", disabled_osd_msg);
    return EXIT_FAILURE;
  } else if (cur_psm == tesseract::PSM_AUTO_OSD) {
-      api.SetPageSegMode(tesseract::PSM_AUTO);
-      osd_warning +=
-          "\nWarning: The page segmentation mode 1 (Auto+OSD) is currently disabled. "
-          "Using PSM 3 (Auto) instead.\n\n";
+    api.SetPageSegMode(tesseract::PSM_AUTO);
+    osd_warning +=
+        "\nWarning: The page segmentation mode 1 (Auto+OSD) is currently "
+        "disabled. "
+        "Using PSM 3 (Auto) instead.\n\n";
  } else if (cur_psm == tesseract::PSM_SPARSE_TEXT_OSD) {
-      api.SetPageSegMode(tesseract::PSM_SPARSE_TEXT);
-      osd_warning +=
-          "\nWarning: The page segmentation mode 12 (Sparse text + OSD) is currently disabled. "
-          "Using PSM 11 (Sparse text) instead.\n\n";
+    api.SetPageSegMode(tesseract::PSM_SPARSE_TEXT);
+    osd_warning +=
+        "\nWarning: The page segmentation mode 12 (Sparse text + OSD) is "
+        "currently disabled. "
+        "Using PSM 11 (Sparse text) instead.\n\n";
  }
-#endif  // def DISABLED_LEGACY_ENGINE
+#endif // def DISABLED_LEGACY_ENGINE

-  // Avoid memory leak caused by auto variable when exit() is called.
-  static tesseract::PointerVector<tesseract::TessResultRenderer> renderers;
+  tesseract::PointerVector<tesseract::TessResultRenderer> renderers;

  if (in_training_mode) {
    renderers.push_back(nullptr);
@ -765,16 +767,16 @@ int main(int argc, char** argv) {
  }

  bool banner = false;
-  if (outputbase != nullptr && strcmp(outputbase, "-") &&
-      strcmp(outputbase, "stdout")) {
+  if (outputbase != nullptr && strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
    banner = true;
  }

  if (!renderers.empty()) {
-    if (banner) PrintBanner();
+    if (banner)
+      PrintBanner();
 #ifdef DISABLED_LEGACY_ENGINE
    if (!osd_warning.empty()) {
-      fprintf(stderr, "%s",osd_warning.c_str());
+      fprintf(stderr, "%s", osd_warning.c_str());
    }
 #endif
    bool succeed = api.ProcessPages(image, nullptr, 0, renderers[0]);
--- a/src/api/wordstrboxrenderer.cpp
+++ b/src/api/wordstrboxrenderer.cpp
@ -16,9 +16,9 @@
 *
 **********************************************************************/

-#include <tesseract/baseapi.h>  // for TessBaseAPI
+#include <tesseract/baseapi.h> // for TessBaseAPI
 #include <tesseract/renderer.h>
-#include "tesseractclass.h"  // for Tesseract
+#include "tesseractclass.h" // for Tesseract

 namespace tesseract {

@ -28,16 +28,16 @@ namespace tesseract {
 * file. Returned string must be freed with the delete [] operator.
 */

-char* TessBaseAPI::GetWordStrBoxText(int page_number=0) {
+char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
    return nullptr;

-  STRING wordstr_box_str("");
+  std::string wordstr_box_str;
  int left = 0, top = 0, right = 0, bottom = 0;

  bool first_line = true;

-  LTRResultIterator* res_it = GetLTRIterator();
+  LTRResultIterator *res_it = GetLTRIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
@ -46,41 +46,40 @@ char* TessBaseAPI::GetWordStrBoxText(int page_number=0) {

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      if (!first_line) {
-        wordstr_box_str.add_str_int("\n\t ", right + 1);
-        wordstr_box_str.add_str_int(" ", image_height_ - bottom);
-        wordstr_box_str.add_str_int(" ", right + 5);
-        wordstr_box_str.add_str_int(" ", image_height_ - top);
-        wordstr_box_str.add_str_int(" ", page_number);  // row for tab for EOL
+        wordstr_box_str += "\n\t " + std::to_string(right + 1);
+        wordstr_box_str += " " + std::to_string(image_height_ - bottom);
+        wordstr_box_str += " " + std::to_string(right + 5);
+        wordstr_box_str += " " + std::to_string(image_height_ - top);
+        wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
        wordstr_box_str += "\n";
      } else {
        first_line = false;
      }
-     // Use bounding box for whole line for WordStr
-     res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
-      wordstr_box_str.add_str_int("WordStr ", left);
-      wordstr_box_str.add_str_int(" ", image_height_ - bottom);
-      wordstr_box_str.add_str_int(" ", right);
-      wordstr_box_str.add_str_int(" ", image_height_ - top);
-      wordstr_box_str.add_str_int(" ", page_number);  // word
+      // Use bounding box for whole line for WordStr
+      res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+      wordstr_box_str += "WordStr " + std::to_string(left);
+      wordstr_box_str += " " + std::to_string(image_height_ - bottom);
+      wordstr_box_str += " " + std::to_string(right);
+      wordstr_box_str += " " + std::to_string(image_height_ - top);
+      wordstr_box_str += " " + std::to_string(page_number); // word
      wordstr_box_str += " #";
    }
    do {
-      wordstr_box_str +=
-          std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
+      wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
      wordstr_box_str += " ";
      res_it->Next(RIL_WORD);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
  }

  if (left != 0 && top != 0 && right != 0 && bottom != 0) {
-    wordstr_box_str.add_str_int("\n\t ", right + 1);
-    wordstr_box_str.add_str_int(" ", image_height_ - bottom);
-    wordstr_box_str.add_str_int(" ", right + 5);
-    wordstr_box_str.add_str_int(" ", image_height_ - top);
-    wordstr_box_str.add_str_int(" ", page_number);  // row for tab for EOL
+    wordstr_box_str += "\n\t " + std::to_string(right + 1);
+    wordstr_box_str += " " + std::to_string(image_height_ - bottom);
+    wordstr_box_str += " " + std::to_string(right + 5);
+    wordstr_box_str += " " + std::to_string(image_height_ - top);
+    wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
    wordstr_box_str += "\n";
  }
-  char* ret = new char[wordstr_box_str.length() + 1];
+  char *ret = new char[wordstr_box_str.length() + 1];
  strcpy(ret, wordstr_box_str.c_str());
  delete res_it;
  return ret;
@ -89,17 +88,17 @@ char* TessBaseAPI::GetWordStrBoxText(int page_number=0) {
 /**********************************************************************
 * WordStrBox Renderer interface implementation
 **********************************************************************/
-TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char* outputbase)
+TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}

-bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI* api) {
-  const std::unique_ptr<const char[]> wordstrbox(
-      api->GetWordStrBoxText(imagenum()));
-  if (wordstrbox == nullptr) return false;
+bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> wordstrbox(api->GetWordStrBoxText(imagenum()));
+  if (wordstrbox == nullptr)
+    return false;

  AppendString(wordstrbox.get());

  return true;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/arch/dotproduct.cpp
+++ b/src/arch/dotproduct.cpp
@ -19,10 +19,11 @@
 namespace tesseract {

 // Computes and returns the dot product of the two n-vectors u and v.
-double DotProductNative(const double* u, const double* v, int n) {
+double DotProductNative(const double *u, const double *v, int n) {
  double total = 0.0;
-  for (int k = 0; k < n; ++k) total += u[k] * v[k];
+  for (int k = 0; k < n; ++k)
+    total += u[k] * v[k];
  return total;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/arch/dotproduct.h
+++ b/src/arch/dotproduct.h
@ -20,17 +20,17 @@
 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
-double DotProductNative(const double* u, const double* v, int n);
+double DotProductNative(const double *u, const double *v, int n);

 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-double DotProductAVX(const double* u, const double* v, int n);
+double DotProductAVX(const double *u, const double *v, int n);

 // Use Intel FMA.
-double DotProductFMA(const double* u, const double* v, int n);
+double DotProductFMA(const double *u, const double *v, int n);

 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double* u, const double* v, int n);
+double DotProductSSE(const double *u, const double *v, int n);

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_ARCH_DOTPRODUCT_H_
+#endif // TESSERACT_ARCH_DOTPRODUCT_H_
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@ -16,18 +16,20 @@
 ///////////////////////////////////////////////////////////////////////

 #if !defined(__AVX__)
-#error Implementation only for AVX capable architectures
-#endif
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for AVX capable architectures
+#  endif
+#else

-#include <immintrin.h>
-#include <cstdint>
-#include "dotproduct.h"
+#  include <immintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"

 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
-double DotProductAVX(const double* u, const double* v, int n) {
+double DotProductAVX(const double *u, const double *v, int n) {
  const unsigned quot = n / 8;
  const unsigned rem = n % 8;
  __m256d t0 = _mm256_setzero_pd();
@ -56,4 +58,6 @@ double DotProductAVX(const double* u, const double* v, int n) {
  return result;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
+
+#endif
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@ -16,18 +16,20 @@
 ///////////////////////////////////////////////////////////////////////

 #if !defined(__FMA__)
-#error Implementation only for FMA capable architectures
-#endif
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for FMA capable architectures
+#  endif
+#else

-#include <immintrin.h>
-#include <cstdint>
-#include "dotproduct.h"
+#  include <immintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"

 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
-double DotProductFMA(const double* u, const double* v, int n) {
+double DotProductFMA(const double *u, const double *v, int n) {
  const unsigned quot = n / 8;
  const unsigned rem = n % 8;
  __m256d t0 = _mm256_setzero_pd();
@ -54,4 +56,6 @@ double DotProductFMA(const double* u, const double* v, int n) {
  return result;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
+
+#endif
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@ -16,19 +16,21 @@
 ///////////////////////////////////////////////////////////////////////

 #if !defined(__SSE4_1__)
-#error Implementation only for SSE 4.1 capable architectures
-#endif
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for SSE 4.1 capable architectures
+#  endif
+#else

-#include <emmintrin.h>
-#include <smmintrin.h>
-#include <cstdint>
-#include "dotproduct.h"
+#  include <emmintrin.h>
+#  include <smmintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"

 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double* u, const double* v, int n) {
+double DotProductSSE(const double *u, const double *v, int n) {
  int max_offset = n - 2;
  int offset = 0;
  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
@ -37,8 +39,7 @@ double DotProductSSE(const double* u, const double* v, int n) {
  if (offset <= max_offset) {
    offset = 2;
    // Aligned load is reputedly faster but requires 16 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
-        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Use aligned load.
      sum = _mm_load_pd(u);
      __m128d floats2 = _mm_load_pd(v);
@ -78,4 +79,6 @@ double DotProductSSE(const double* u, const double* v, int n) {
  return result;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
+
+#endif
--- a/src/arch/intsimdmatrix.cpp
+++ b/src/arch/intsimdmatrix.cpp
@ -2,7 +2,6 @@
 // File:        intsimdmatrix.cpp
 // Description: Base class for 8-bit int SIMD matrix multipliers.
 // Author:      Ray Smith
-// Created:     Tue Aug 15 08:01:32 PST 2017
 //
 // (C) Copyright 2017, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -17,33 +16,30 @@
 ///////////////////////////////////////////////////////////////////////

 #include "intsimdmatrix.h"
-#include <tesseract/genericvector.h>      // for GenericVector
-#include "matrix.h"             // for GENERIC_2D_ARRAY
-#include "simddetect.h"         // for SIMDDetect
+#include "matrix.h"     // for GENERIC_2D_ARRAY
+#include "simddetect.h" // for SIMDDetect

 namespace tesseract {

-const IntSimdMatrix* IntSimdMatrix::intSimdMatrix = nullptr;
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr;

 // Computes a reshaped copy of the weight matrix w.
-void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w,
-                         std::vector<int8_t>& shaped_w) const {
+void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
+                         int32_t &rounded_num_out) const {
  const int num_out = w.dim1();
  const int num_in = w.dim2() - 1;
  // The rounded-up sizes of the reshaped weight matrix, excluding biases.
  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
-  int rounded_num_out = RoundOutputs(num_out);
+  rounded_num_out = RoundOutputs(num_out);
  // Add the bias and compute the required size.
  shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0);
  int shaped_index = 0;
  int output = 0;
  // Each number of registers needs a different format! Iterates over the
  // different numbers of registers (each a power of 2).
-  for (int num_registers = max_output_registers_; num_registers >= 1;
-       num_registers /= 2) {
+  for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) {
    // The number of outputs that we will generate with this many registers.
-    int num_outputs_per_register_set =
-        num_registers * num_outputs_per_register_;
+    int num_outputs_per_register_set = num_registers * num_outputs_per_register_;
    // Use the max number of registers until we have to go fewer.
    while (output + num_outputs_per_register_set <= rounded_num_out) {
      // Accumulating outputs in registers saves iterating over the inputs, so
@ -64,7 +60,8 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w,
      // Append the bias weights for the register set.
      for (int j = 0; j < num_outputs_per_register_set; ++j) {
        int8_t weight = 0;
-        if (output + j < num_out) weight = w(output + j, num_in);
+        if (output + j < num_out)
+          weight = w(output + j, num_in);
        shaped_w[shaped_index++] = weight;
      }
      output += num_outputs_per_register_set;
@ -76,19 +73,19 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w,
 // u is of size W.dim2() - 1 and the output v is of size W.dim1().
 // u is imagined to have an extra element at the end with value 1, to
 // implement the bias, but it doesn't actually have it.
-void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
-                                    const GenericVector<double>& scales,
-                                    const int8_t* u, double* v) {
+void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
+                                    const std::vector<double> &scales, const int8_t *u, double *v) {
  int num_out = w.dim1();
  int num_in = w.dim2() - 1;
  // Base implementation.
  for (int i = 0; i < num_out; ++i) {
-    const int8_t* wi = w[i];
+    const int8_t *wi = w[i];
    int total = 0;
-    for (int j = 0; j < num_in; ++j) total += wi[j] * u[j];
+    for (int j = 0; j < num_in; ++j)
+      total += wi[j] * u[j];
    // Add in the bias and correct for integer values.
-    v[i] = (static_cast<double>(total) / INT8_MAX + wi[num_in]) * scales[i];
+    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
  }
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/arch/intsimdmatrix.h
+++ b/src/arch/intsimdmatrix.h
@ -2,7 +2,6 @@
 // File:        intsimdmatrix.h
 // Description: Base class for 8-bit int SIMD matrix multipliers.
 // Author:      Ray Smith
-// Created:     Tue Aug 15 07:37:20 PST 2017
 //
 // (C) Copyright 2017, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -19,15 +18,15 @@
 #ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
 #define TESSERACT_ARCH_INTSIMDMATRIX_H_

+#include <tesseract/export.h>
+
 #include <cstdint>
 #include <vector>

+namespace tesseract {
+
 template <class T>
 class GENERIC_2D_ARRAY;
-template <typename T>
-class GenericVector;
-
-namespace tesseract {

 // Base class for a SIMD function to multiply a matrix by a vector, with sources
 // of 8-bit signed integer, and result in a double, after appropriate scaling.
@ -60,10 +59,10 @@ namespace tesseract {
 // NOTE that, although the subclasses execute on different SIMD hardware, no
 // virtual methods are needed, as the constructor sets up everything that
 // is required to allow the base class implementation to do all the work.
-struct IntSimdMatrix {
+struct TESS_API IntSimdMatrix {
  // Computes a reshaped copy of the weight matrix w.
-  void Init(const GENERIC_2D_ARRAY<int8_t>& w,
-            std::vector<int8_t>& shaped_w) const;
+  void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
+            int32_t &rounded_num_out) const;

  // Rounds the size up to a multiple of the input register size (in int8_t).
  int RoundInputs(int size) const {
@ -79,9 +78,8 @@ struct IntSimdMatrix {
  // u is imagined to have an extra element at the end with value 1, to
  // implement the bias, but it doesn't actually have it.
  // Computes the base C++ implementation.
-  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
-                              const GenericVector<double>& scales,
-                              const int8_t* u, double* v);
+  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
+                              const int8_t *u, double *v);

  // Rounds the input up to a multiple of the given factor.
  static int Roundup(int input, int factor) {
@ -97,9 +95,8 @@ struct IntSimdMatrix {
  // RoundInputs above.
  // The input will be over-read to the extent of the padding. There are no
  // alignment requirements.
-  using MatrixDotVectorFunction = void (*)(int, int, const int8_t*,
-                                           const double*, const int8_t*,
-                                           double*);
+  using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
+                                           double *);
  MatrixDotVectorFunction matrixDotVectorFunction;

  // Number of 32 bit outputs held in each register.
@ -113,11 +110,14 @@ struct IntSimdMatrix {
  // Number of groups of inputs to be broadcast.
  // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_

-  static const IntSimdMatrix* intSimdMatrix;
+  static const IntSimdMatrix *intSimdMatrix;
+  // Only available with NEON.
+  static const IntSimdMatrix intSimdMatrixNEON;
+  // Only available with AVX2 / SSE.
  static const IntSimdMatrix intSimdMatrixAVX2;
  static const IntSimdMatrix intSimdMatrixSSE;
 };

-}  // namespace tesseract
+} // namespace tesseract

-#endif  // TESSERACT_ARCH_INTSIMDMATRIX_H_
+#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
--- a/src/arch/intsimdmatrixavx2.cpp
+++ b/src/arch/intsimdmatrixavx2.cpp
@ -2,7 +2,6 @@
 // File:        intsimdmatrixavx2.cpp
 // Description: matrix-vector product for 8-bit data on avx2.
 // Author:      Ray Smith
-// Created:     Fri Aug 04 13:26:20 PST 2017
 //
 // (C) Copyright 2017, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -17,15 +16,17 @@
 ///////////////////////////////////////////////////////////////////////

 #if !defined(__AVX2__)
-#error Implementation only for AVX2 capable architectures
-#endif
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for AVX2 capable architectures
+#  endif
+#else

-#include "intsimdmatrix.h"
+#  include "intsimdmatrix.h"

-#include <immintrin.h>
-#include <cstdint>
-#include <algorithm>
-#include <vector>
+#  include <immintrin.h>
+#  include <algorithm>
+#  include <cstdint>
+#  include <vector>

 namespace tesseract {

@ -57,11 +58,10 @@ constexpr int kNumInputGroups = kNumInputsPerRegister / kNumInputsPerGroup;
 // weights and reps are scratch registers.
 // This function must be inlined with references in order for the compiler to
 // correctly use the registers declared in the caller.
-static inline void MultiplyGroup(const __m256i& rep_input, const __m256i& ones,
-                                 const int8_t*& wi, __m256i& weights,
-                                 __m256i& reps, __m256i& result) {
+static inline void MultiplyGroup(const __m256i &rep_input, const __m256i &ones, const int8_t *&wi,
+                                 __m256i &weights, __m256i &reps, __m256i &result) {
  // Load a 4x8 block of weights.
-  weights = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(wi));
+  weights = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(wi));
  wi += kNumInputsPerRegister;
  // Normalize the signs on rep_input, weights, so weights is always +ve.
  reps = _mm256_sign_epi8(rep_input, weights);
@ -78,24 +78,65 @@ static inline void MultiplyGroup(const __m256i& rep_input, const __m256i& ones,
  result = _mm256_add_epi32(result, weights);
 }

-// Extracts and converts 8x32-bit results from result, adding the bias from wi
-// and scaling by scales, before storing in *v. Note that wi, scales and v are
-// expected to contain 8 consecutive elements or num_out if less.
-static inline void ExtractResults(__m256i& result, __m256i& shift_id,
-                                  const int8_t*& wi, const double*& scales,
-                                  int num_out, double*& v) {
-  for (int out = 0; out < num_out; ++out) {
-#ifndef _MSC_VER
-    auto res = _mm256_extract_epi32(result, 0);
-#else
-    // Workaround MSVC's ICE
-    // _mm256_extract_epi32(X, Y) == ((int32_t*)&X)[Y]
-    auto res = ((int32_t*)&result)[0];
-#endif
-    *v++ = (static_cast<double>(res) / INT8_MAX + *wi++) * *scales++;
-    // Rotate the results in int32_t units, so the next result is ready.
-    result = _mm256_permutevar8x32_epi32(result, shift_id);
-  }
+// Load 64 bits into the bottom of a 128bit register.
+// We don't actually care what the top 64bits are, but this ends
+// up with them being zero.
+static inline __m128i load64_to_128(const int8_t *wi_) {
+  const int64_t *wi = reinterpret_cast<const int64_t *>(wi_);
+  return _mm_set_epi64x(0, wi[0]);
+}
+
+static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
+                                   double *v) {
+  __m128i w128 = load64_to_128(wi);          // 8x8bit vals in bottom of 128bit reg
+  __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg
+  __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256d scale0123 = _mm256_loadu_pd(scales);
+  __m256d scale4567 = _mm256_loadu_pd(scales + 4);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result = _mm256_add_epi32(result, w256);     // result += bias * 127
+  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
+  result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v, res0123);
+  _mm256_storeu_pd(v + 4, res4567);
+}
+
+static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
+                                    const double *&scales, double *&v) {
+  __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));
+  // 16x8bit bias vals filling the whole 128bit reg
+  const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  __m256d scale0123 = _mm256_loadu_pd(scales);
+  __m256d scale4567 = _mm256_loadu_pd(scales + 4);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127
+  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v, res0123);
+  _mm256_storeu_pd(v + 4, res4567);
+  w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));
+  w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  scale0123 = _mm256_loadu_pd(scales + 8);
+  scale4567 = _mm256_loadu_pd(scales + 12);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127
+  res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));
+  res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v + 8, res0123);
+  _mm256_storeu_pd(v + 12, res4567);
+  wi += 16;
+  scales += 16;
+  v += 16;
 }

 // Computes part of matrix.vector v = Wu. Computes N=64 results.
@ -105,13 +146,11 @@ static inline void ExtractResults(__m256i& result, __m256i& shift_id,
 // bias weights, before continuing with any more weights.
 // u must be padded out with zeros to
 // kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
-static void PartialMatrixDotVector64(const int8_t* wi, const double* scales,
-                                     const int8_t* u, int num_in, int num_out,
-                                     double* v) {
+static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
-  __m256i ones =
-      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
@ -124,15 +163,12 @@ static void PartialMatrixDotVector64(const int8_t* wi, const double* scales,
  __m256i result7 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
-    __m256i inputs =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in;
-         ++ig, j += kNumInputsPerGroup) {
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input =
-          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
@ -147,27 +183,19 @@ static void PartialMatrixDotVector64(const int8_t* wi, const double* scales,
      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
    }
  }
-  ExtractResults(result0, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result1, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result2, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result3, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result4, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result5, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result6, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  num_out -= kNumOutputsPerRegister * 7;
-  ExtractResults(result7, shift_id, wi, scales,
-                 std::min(kNumOutputsPerRegister, num_out), v);
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
+  ExtractResults16(result4, result5, wi, scales, v);
+  ExtractResults16(result6, result7, wi, scales, v);
 }

 // Computes part of matrix.vector v = Wu. Computes N=32 results.
 // For details see PartialMatrixDotVector64 with N=32.
-static void PartialMatrixDotVector32(const int8_t* wi, const double* scales,
-                                     const int8_t* u, int num_in, int num_out,
-                                     double* v) {
+static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
-  __m256i ones =
-      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
@ -176,15 +204,12 @@ static void PartialMatrixDotVector32(const int8_t* wi, const double* scales,
  __m256i result3 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
-    __m256i inputs =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in;
-         ++ig, j += kNumInputsPerGroup) {
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input =
-          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
@ -195,38 +220,29 @@ static void PartialMatrixDotVector32(const int8_t* wi, const double* scales,
      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
    }
  }
-  ExtractResults(result0, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result1, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  ExtractResults(result2, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  num_out -= kNumOutputsPerRegister * 3;
-  ExtractResults(result3, shift_id, wi, scales,
-                 std::min(kNumOutputsPerRegister, num_out), v);
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
 }

 // Computes part of matrix.vector v = Wu. Computes N=16 results.
 // For details see PartialMatrixDotVector64 with N=16.
-static void PartialMatrixDotVector16(const int8_t* wi, const double* scales,
-                                     const int8_t* u, int num_in, int num_out,
-                                     double* v) {
+static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
-  __m256i ones =
-      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  __m256i result1 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
-    __m256i inputs =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in;
-         ++ig, j += kNumInputsPerGroup) {
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input =
-          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
@ -235,35 +251,27 @@ static void PartialMatrixDotVector16(const int8_t* wi, const double* scales,
      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
    }
  }
-  ExtractResults(result0, shift_id, wi, scales, kNumOutputsPerRegister, v);
-  num_out -= kNumOutputsPerRegister;
-  ExtractResults(result1, shift_id, wi, scales,
-                 std::min(kNumOutputsPerRegister, num_out), v);
+  ExtractResults16(result0, result1, wi, scales, v);
 }

 // Computes part of matrix.vector v = Wu. Computes N=8 results.
 // For details see PartialMatrixDotVector64 with N=8.
-static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
-                                    const int8_t* u, int num_in, int num_out,
-                                    double* v) {
+static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
+                                           int num_in, double *v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
-  __m256i ones =
-      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
-    __m256i inputs =
-        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
-    for (int ig = 0; ig < kNumInputGroups && j < num_in;
-         ++ig, j += kNumInputsPerGroup) {
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
-      __m256i rep_input =
-          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
@ -271,19 +279,17 @@ static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
-  ExtractResults(result0, shift_id, wi, scales, num_out, v);
+  ExtractResults8(result0, wi, scales, v);
 }

-static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
-                            const double* scales, const int8_t* u, double* v) {
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
  const int num_out = dim1;
  const int num_in = dim2 - 1;
  // Each call to a partial_func_ produces group_size outputs, except the
  // last one, which can produce less.
-  const int rounded_num_in =
-    IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
-  const int rounded_num_out =
-    IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
+  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
+  const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
  int output = 0;

@ -292,7 +298,7 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
  // Run with this group size, until it would produce too much output, then
  // switch to a smaller size.
  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, num_out - output, v);
+    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
    wi += w_step;
    scales += group_size;
    v += group_size;
@ -300,43 +306,42 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
  group_size /= 2;
  w_step /= 2;

-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, num_out - output, v);
+  if (output + group_size <= rounded_num_out) {
+    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
    wi += w_step;
    scales += group_size;
    v += group_size;
+    output += group_size;
  }
  group_size /= 2;
  w_step /= 2;

-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, num_out - output, v);
+  if (output + group_size <= rounded_num_out) {
+    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
    wi += w_step;
    scales += group_size;
    v += group_size;
+    output += group_size;
  }
  group_size /= 2;
  w_step /= 2;

-  for (; output + group_size <= rounded_num_out; output += group_size) {
-    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, num_out - output, v);
-    wi += w_step;
-    scales += group_size;
-    v += group_size;
-  }
+  if (output + group_size <= rounded_num_out)
+    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
 }

 const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
-  // Function.
-  matrixDotVector,
-  // Number of 32 bit outputs held in each register.
-  kNumOutputsPerRegister,
-  // Maximum number of registers that we will use to hold outputs.
-  kMaxOutputRegisters,
-  // Number of 8 bit inputs in the inputs register.
-  kNumInputsPerRegister,
-  // Number of inputs in each weight group.
-  kNumInputsPerGroup
-};
+    // Function.
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    kNumOutputsPerRegister,
+    // Maximum number of registers that we will use to hold outputs.
+    kMaxOutputRegisters,
+    // Number of 8 bit inputs in the inputs register.
+    kNumInputsPerRegister,
+    // Number of inputs in each weight group.
+    kNumInputsPerGroup};

-}  // namespace tesseract.
+} // namespace tesseract.
+
+#endif
--- a/src/arch/intsimdmatrixneon.cpp
+++ b/src/arch/intsimdmatrixneon.cpp
@ -0,0 +1,203 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrixneon.cpp
+// Description: matrix-vector product for 8-bit data on neon.
+// Author:      Robin Watts (from the AVX2 original by Ray Smith)
+//
+// (C) Copyright 2017, Google Inc.
+// (C) Copyright 2020, Artifex Software Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if defined(__ARM_NEON)
+
+#  include "intsimdmatrix.h"
+
+#  include <algorithm>
+#  include <cstdint>
+#  include <vector>
+#  include "arm_neon.h"
+
+namespace tesseract {
+
+// Number of outputs held in each register. (Actually, we use a
+// pair of 4x32 registers, so 8 x 32 bit ints).
+constexpr int kNumOutputsPerRegister = 8;
+// Maximum number of registers that we will use.
+constexpr int kMaxOutputRegisters = 1;
+// Number of inputs in the inputs register.
+constexpr int kNumInputsPerRegister = 8;
+// Number of inputs in each weight group.
+constexpr int kNumInputsPerGroup = 8;
+
+// Function to compute part of a matrix.vector multiplication. The weights
+// are in a very specific order (see below) in wi, which is multiplied by
+// u of length num_in, to produce output v after scaling the integer results
+// by the corresponding member of scales.
+// The amount of w and scales consumed is fixed and not available to the
+// caller.
+
+// Computes part of matrix.vector v = Wu. Computes N=8 results.
+// The weights *must* be arranged so that consecutive reads from wi
+// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
+// (kNumInputsPerGroup inputs))). After that there must be N consecutive
+// bias weights, before continuing with any more weights.
+// u must be padded out with zeros to
+// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
+static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
+                                           const double *__restrict scales,
+                                           const int8_t *__restrict u, int num_in,
+                                           double *__restrict v, int num_out) {
+  // Initialize all the results to 0.
+  int32x4_t result0123 = {0, 0, 0, 0};
+  int32x4_t result4567 = {0, 0, 0, 0};
+  int8x8_t bias_scale = {127, 127, 127, 127, 127, 127, 127, 127};
+  // Iterate over the input (u), one registerful at a time.
+  for (int j = 0; j < num_in; j += 8) {
+    int8x8_t vu = vld1_s8(u);              // vu     = u0  u1  u2  u3  u4  u5  u6  u7
+    int8x16_t vw01 = vld1q_s8(wi);         // vw0    = w00 w01 w02 w03 w04 w05 w06 w07
+                                           // w10 w11 w12 w13 w14 w15 w16 w17
+    int8x16_t vw23 = vld1q_s8(wi + 8 * 2); // vw2    = w20 w21 w22 w23 w24 w25 w26 w27 w30
+                                           // w31 w32 w33 w34 w35 w36 w37
+    int8x16_t vw45 = vld1q_s8(wi + 8 * 4); // vw4    = w40 w41 w42 w43 w44 w45 w46 w47 w50
+                                           // w51 w52 w53 w54 w55 w56 w57
+    int8x16_t vw67 = vld1q_s8(wi + 8 * 6); // vw6    = w60 w61 w62 w63 w64 w65 w66 w67 w70
+                                           // w71 w72 w73 w74 w75 w76 w77
+
+    int16x8_t vrow0q = vmull_s8(vget_low_s8(vw01), vu); // vrow0q = vw00.u0 w01.u1 w02.u2
+                                                        // w03.u3 vw04.u4 w05.u5 w06.u6 w07.u7
+    int16x8_t vrow1q = vmull_s8(vget_high_s8(vw01),
+                                vu);                    // vrow1q = vw10.u0 w11.u1 w12.u2 w13.u3
+                                                        // vw14.u4 w15.u5 w16.u6 w17.u7
+    int16x8_t vrow2q = vmull_s8(vget_low_s8(vw23), vu); // vrow2q = vw20.u0 w21.u1 w22.u2
+                                                        // w23.u3 vw24.u4 w25.u5 w26.u6 w27.u7
+    int16x8_t vrow3q = vmull_s8(vget_high_s8(vw23),
+                                vu);                    // vrow3q = vw30.u0 w31.u1 w32.u2 w33.u3
+                                                        // vw34.u4 w35.u5 w36.u6 w37.u7
+    int16x8_t vrow4q = vmull_s8(vget_low_s8(vw45), vu); // vrow4q = vw40.u0 w41.u1 w42.u2
+                                                        // w43.u3 vw44.u4 w45.u5 w46.u6 w47.u7
+    int16x8_t vrow5q = vmull_s8(vget_high_s8(vw45),
+                                vu);                    // vrow5q = vw50.u0 w51.u1 w52.u2 w53.u3
+                                                        // vw54.u4 w55.u5 w56.u6 w57.u7
+    int16x8_t vrow6q = vmull_s8(vget_low_s8(vw67), vu); // vrow6q = vw60.u0 w61.u1 w62.u2
+                                                        // w63.u3 vw64.u4 w65.u5 w66.u6 w67.u7
+    int16x8_t vrow7q = vmull_s8(vget_high_s8(vw67),
+                                vu); // vrow7q = vw70.u0 w71.u1 w72.u2 w73.u3
+                                     // vw74.u4 w75.u5 w76.u6 w77.u7
+
+    int32x4_t vrow0q2 = vpaddlq_s16(vrow0q); // vrow0q2 = vw00.u0+w01.u1 w02.u2+w03.u3
+                                             // vw04.u4+w05.u5 w06.u6+w07.u7
+    int32x4_t vrow1q2 = vpaddlq_s16(vrow1q); // vrow1q2 = vw10.u0+w11.u1 w12.u2+w13.u3
+                                             // vw14.u4+w15.u5 w16.u6+w17.u7
+    int32x4_t vrow2q2 = vpaddlq_s16(vrow2q); // vrow2q2 = vw20.u0+w21.u1 w22.u2+w23.u3
+                                             // vw24.u4+w25.u5 w26.u6+w27.u7
+    int32x4_t vrow3q2 = vpaddlq_s16(vrow3q); // vrow3q2 = vw30.u0+w31.u1 w32.u2+w33.u3
+                                             // vw34.u4+w35.u5 w36.u6+w37.u7
+    int32x4_t vrow4q2 = vpaddlq_s16(vrow4q); // vrow4q2 = vw40.u0+w41.u1 w42.u2+w43.u3
+                                             // vw44.u4+w45.u5 w46.u6+w47.u7
+    int32x4_t vrow5q2 = vpaddlq_s16(vrow5q); // vrow5q2 = vw50.u0+w51.u1 w52.u2+w53.u3
+                                             // vw54.u4+w55.u5 w56.u6+w57.u7
+    int32x4_t vrow6q2 = vpaddlq_s16(vrow6q); // vrow6q2 = vw60.u0+w61.u1 w62.u2+w63.u3
+                                             // vw64.u4+w65.u5 w66.u6+w67.u7
+    int32x4_t vrow7q2 = vpaddlq_s16(vrow7q); // vrow7q2 = vw70.u0+w71.u1 w72.u2+w73.u3
+                                             // vw74.u4+w75.u5 w76.u6+w77.u7
+
+    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
+                           vpadd_s32(vget_low_s32(vrow1q2), vget_high_s32(vrow1q2)));
+    // vrow0q2 = vw00.u0+...+w03.u3 vw04.u4+...+w07.u7 vw10.u0+...+w13.u3
+    // vw14.u4+...+w17.u7
+    vrow2q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)),
+                           vpadd_s32(vget_low_s32(vrow3q2), vget_high_s32(vrow3q2)));
+    // vrow2q2 = vw20.u0+...+w23.u3 vw24.u4+...+w27.u7 vw30.u0+...+w33.u3
+    // vw34.u4+...+w37.u7
+    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
+                           vpadd_s32(vget_low_s32(vrow5q2), vget_high_s32(vrow5q2)));
+    // vrow4q2 = vw40.u0+...+w43.u3 vw44.u4+...+w47.u7 vw50.u0+...+w53.u3
+    // vw54.u4+...+w57.u7
+    vrow6q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)),
+                           vpadd_s32(vget_low_s32(vrow7q2), vget_high_s32(vrow7q2)));
+    // vrow6q2 = vw60.u0+...+w63.u3 vw64.u4+...+w67.u7 vw70.u0+...+w73.u3
+    // vw74.u4+...+w77.u7
+
+    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
+                           vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)));
+    // vrow0q2 = vw00.u0+...+w07.u7 vw10.u0+...+w17.u7 vw20.u0+...+w27.u7
+    // vw30.u0+...+w37.u7
+    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
+                           vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)));
+    // vrow4q2 = vw40.u0+...+w47.u7 vw50.u0+...+w57.u7 vw60.u0+...+w67.u7
+    // vw70.u0+...+w77.u7
+
+    result0123 = vaddq_s32(result0123, vrow0q2);
+    result4567 = vaddq_s32(result4567, vrow4q2);
+    u += 8;
+    wi += 64;
+  }
+  {
+    int8x8_t bias = vld1_s8(wi); // vw0    = b0  b1  b2  b3  b4  b5  b6  b7
+    int16x8_t scaled_bias = vmull_s8(bias, bias_scale);
+    result0123 = vaddw_s16(result0123, vget_low_s16(scaled_bias));
+    result4567 = vaddw_s16(result4567, vget_high_s16(scaled_bias));
+    *v++ = vget_lane_s32(vget_low_s32(result0123), 0) * *scales++;
+    if (num_out > 1)
+      *v++ = vget_lane_s32(vget_low_s32(result0123), 1) * *scales++;
+    if (num_out > 2)
+      *v++ = vget_lane_s32(vget_high_s32(result0123), 0) * *scales++;
+    if (num_out > 3)
+      *v++ = vget_lane_s32(vget_high_s32(result0123), 1) * *scales++;
+    if (num_out > 4)
+      *v++ = vget_lane_s32(vget_low_s32(result4567), 0) * *scales++;
+    if (num_out > 5)
+      *v++ = vget_lane_s32(vget_low_s32(result4567), 1) * *scales++;
+    if (num_out > 6)
+      *v++ = vget_lane_s32(vget_high_s32(result4567), 0) * *scales++;
+    if (num_out > 7)
+      *v = vget_lane_s32(vget_high_s32(result4567), 1) * *scales;
+  }
+}
+
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
+  const int num_out = dim1;
+  const int num_in = dim2 - 1;
+  // Each call to PartialMatrixDotVector8 produces group_size outputs, except
+  // the last one, which can produce fewer.
+  const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
+  int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
+  int output = 0;
+
+  int w_step = (rounded_num_in + 1) * group_size;
+
+  for (; output + group_size <= num_out; output += group_size) {
+    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v, kNumOutputsPerRegister);
+    wi += w_step;
+    scales += group_size;
+    v += group_size;
+  }
+  if (output < num_out)
+    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v,
+                            num_out & (kNumOutputsPerRegister - 1));
+}
+
+const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
+    // Function.
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    kNumOutputsPerRegister,
+    // Maximum number of registers that we will use to hold outputs.
+    kMaxOutputRegisters,
+    // Number of 8 bit inputs in the inputs register.
+    kNumInputsPerRegister,
+    // Number of inputs in each weight group.
+    kNumInputsPerGroup};
+
+} // namespace tesseract.
+
+#endif /* __ARM_NEON */
--- a/src/arch/intsimdmatrixsse.cpp
+++ b/src/arch/intsimdmatrixsse.cpp
@ -16,20 +16,22 @@
 ///////////////////////////////////////////////////////////////////////

 #if !defined(__SSE4_1__)
-#error Implementation only for SSE 4.1 capable architectures
-#endif
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for SSE 4.1 capable architectures
+#  endif
+#else

-#include "intsimdmatrix.h"
+#  include "intsimdmatrix.h"

-#include <cstdint>
-#include <emmintrin.h>
-#include <smmintrin.h>
+#  include <emmintrin.h>
+#  include <smmintrin.h>
+#  include <cstdint>

 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
-static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
+static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
  int max_offset = n - 8;
  int offset = 0;
  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
@ -37,8 +39,8 @@ static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  int32_t result = 0;
  if (offset <= max_offset) {
    offset = 8;
-    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
-    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
+    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u));
+    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v));
    __m128i sum = _mm_cvtepi8_epi16(packed1);
    packed2 = _mm_cvtepi8_epi16(packed2);
    // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit
@ -46,8 +48,8 @@ static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
    // to make 4 32 bit results that still fit in a 128 bit register.
    sum = _mm_madd_epi16(sum, packed2);
    while (offset <= max_offset) {
-      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
-      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
+      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + offset));
+      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + offset));
      offset += 8;
      packed1 = _mm_cvtepi8_epi16(packed1);
      packed2 = _mm_cvtepi8_epi16(packed2);
@ -67,16 +69,15 @@ static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
 }

 // Computes part of matrix.vector v = Wu. Computes 1 result.
-static void PartialMatrixDotVector1(const int8_t* wi, const double* scales,
-                                    const int8_t* u, int num_in,
-                                    double* v) {
+static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
+                                    int num_in, double *v) {
  double total = IntDotProductSSE(u, wi, num_in);
  // Add in the bias and correct for integer values.
-  *v = (total / INT8_MAX + wi[num_in]) * *scales;
+  *v = (total + wi[num_in] * INT8_MAX) * *scales;
 }

-static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
-                            const double* scales, const int8_t* u, double* v) {
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
  const int num_out = dim1;
  const int num_in = dim2 - 1;
  int output = 0;
@ -90,15 +91,16 @@ static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
 }

 const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
-  matrixDotVector,
-  // Number of 32 bit outputs held in each register.
-  1,
-  // Maximum number of registers that we will use to hold outputs.
-  1,
-  // Number of 8 bit inputs in the inputs register.
-  1,
-  // Number of inputs in each weight group.
-  1
-};
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    1,
+    // Maximum number of registers that we will use to hold outputs.
+    1,
+    // Number of 8 bit inputs in the inputs register.
+    1,
+    // Number of inputs in each weight group.
+    1};

-}  // namespace tesseract.
+} // namespace tesseract.
+
+#endif
--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
@ -15,24 +15,36 @@
 // limitations under the License.
 ///////////////////////////////////////////////////////////////////////

-#include "config_auto.h"     // for HAVE_AVX, ...
-#include <numeric>           // for std::inner_product
-#include "simddetect.h"
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // for HAVE_AVX, ...
+#endif
+#include <numeric> // for std::inner_product
 #include "dotproduct.h"
-#include "intsimdmatrix.h"   // for IntSimdMatrix
-#include "params.h"   // for STRING_VAR
-#include "tprintf.h"  // for tprintf
+#include "intsimdmatrix.h" // for IntSimdMatrix
+#include "params.h"        // for STRING_VAR
+#include "simddetect.h"
+#include "tprintf.h" // for tprintf

 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
-# define HAS_CPUID
+#  define HAS_CPUID
 #endif

 #if defined(HAS_CPUID)
-#if defined(__GNUC__)
-# include <cpuid.h>
-#elif defined(_WIN32)
-# include <intrin.h>
+#  if defined(__GNUC__)
+#    include <cpuid.h>
+#  elif defined(_WIN32)
+#    include <intrin.h>
+#  endif
 #endif
+
+#if defined(HAVE_NEON) && !defined(__aarch64__)
+#  ifdef ANDROID
+#    include <cpu-features.h>
+#  else
+/* Assume linux */
+#    include <asm/hwcap.h>
+#    include <sys/auxv.h>
+#  endif
 #endif

 namespace tesseract {
@ -49,11 +61,17 @@ namespace tesseract {
 // in AVX registers.
 DotProductFunction DotProduct;

-static STRING_VAR(dotproduct, "auto",
-                  "Function used for calculation of dot product");
+static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");

 SIMDDetect SIMDDetect::detector;

+#if defined(__aarch64__)
+// ARMv8 always has NEON.
+bool SIMDDetect::neon_available_ = true;
+#elif defined(HAVE_NEON)
+// If true, then Neon has been detected.
+bool SIMDDetect::neon_available_;
+#else
 // If true, then AVX has been detected.
 bool SIMDDetect::avx_available_;
 bool SIMDDetect::avx2_available_;
@ -63,20 +81,22 @@ bool SIMDDetect::avx512BW_available_;
 bool SIMDDetect::fma_available_;
 // If true, then SSE4.1 has been detected.
 bool SIMDDetect::sse_available_;
+#endif

 // Computes and returns the dot product of the two n-vectors u and v.
-static double DotProductGeneric(const double* u, const double* v, int n) {
+static double DotProductGeneric(const double *u, const double *v, int n) {
  double total = 0.0;
-  for (int k = 0; k < n; ++k) total += u[k] * v[k];
+  for (int k = 0; k < n; ++k)
+    total += u[k] * v[k];
  return total;
 }

 // Compute dot product using std::inner_product.
-static double DotProductStdInnerProduct(const double* u, const double* v, int n) {
+static double DotProductStdInnerProduct(const double *u, const double *v, int n) {
  return std::inner_product(u, u + n, v, 0.0);
 }

-static void SetDotProduct(DotProductFunction f, const IntSimdMatrix* m = nullptr) {
+static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
  DotProduct = f;
  IntSimdMatrix::intSimdMatrix = m;
 }
@ -91,29 +111,39 @@ SIMDDetect::SIMDDetect() {
  SetDotProduct(DotProductGeneric);

 #if defined(HAS_CPUID)
-#if defined(__GNUC__)
+#  if defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
    // Note that these tests all use hex because the older compilers don't have
    // the newer flags.
-#if defined(HAVE_SSE4_1)
+#    if defined(HAVE_SSE4_1)
    sse_available_ = (ecx & 0x00080000) != 0;
-#endif
-#if defined(HAVE_FMA)
-    fma_available_ = (ecx & 0x00001000) != 0;
-#endif
-#if defined(HAVE_AVX)
-    avx_available_ = (ecx & 0x10000000) != 0;
-    if (avx_available_) {
-      // There is supposed to be a __get_cpuid_count function, but this is all
-      // there is in my cpuid.h. It is a macro for an asm statement and cannot
-      // be used inside an if.
-      __cpuid_count(7, 0, eax, ebx, ecx, edx);
-      avx2_available_ = (ebx & 0x00000020) != 0;
-      avx512F_available_ = (ebx & 0x00010000) != 0;
-      avx512BW_available_ = (ebx & 0x40000000) != 0;
+#    endif
+#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
+    auto xgetbv = []() {
+      uint32_t xcr0;
+      __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
+      return xcr0;
+    };
+    if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
+      // OSXSAVE bit is set, XMM state and YMM state are fine.
+#      if defined(HAVE_FMA)
+      fma_available_ = (ecx & 0x00001000) != 0;
+#      endif
+#      if defined(HAVE_AVX)
+      avx_available_ = (ecx & 0x10000000) != 0;
+      if (avx_available_) {
+        // There is supposed to be a __get_cpuid_count function, but this is all
+        // there is in my cpuid.h. It is a macro for an asm statement and cannot
+        // be used inside an if.
+        __cpuid_count(7, 0, eax, ebx, ecx, edx);
+        avx2_available_ = (ebx & 0x00000020) != 0;
+        avx512F_available_ = (ebx & 0x00010000) != 0;
+        avx512BW_available_ = (ebx & 0x40000000) != 0;
+      }
+#      endif
    }
-#endif
+#    endif
  }
 #  elif defined(_WIN32)
  int cpuInfo[4];
@ -122,32 +152,45 @@ SIMDDetect::SIMDDetect() {
  max_function_id = cpuInfo[0];
  if (max_function_id >= 1) {
    __cpuid(cpuInfo, 1);
-#if defined(HAVE_SSE4_1)
+#    if defined(HAVE_SSE4_1)
    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
-#endif
-#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
+#    endif
+#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
    if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
      // OSXSAVE bit is set, XMM state and YMM state are fine.
-#if defined(HAVE_FMA)
+#      if defined(HAVE_FMA)
      fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
-#endif
-#if defined(HAVE_AVX)
+#      endif
+#      if defined(HAVE_AVX)
      avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
-#endif
-#if defined(HAVE_AVX2)
+#      endif
+#      if defined(HAVE_AVX2)
      if (max_function_id >= 7) {
        __cpuid(cpuInfo, 7);
        avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
        avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
        avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
      }
-#endif
+#      endif
    }
-#endif
+#    endif
  }
-#else
-#error "I don't know how to test for SIMD with this compiler"
+#  else
+#    error "I don't know how to test for SIMD with this compiler"
+#  endif
 #endif
+
+#if defined(HAVE_NEON) && !defined(__aarch64__)
+#  ifdef ANDROID
+  {
+    AndroidCpuFamily family = android_getCpuFamily();
+    if (family == ANDROID_CPU_FAMILY_ARM)
+      neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
+  }
+#  else
+  /* Assume linux */
+  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
+#  endif
 #endif

  // Select code for calculation of dot product based on autodetection.
@ -167,6 +210,11 @@ SIMDDetect::SIMDDetect() {
  } else if (sse_available_) {
    // SSE detected.
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
+#endif
+#if defined(HAVE_NEON) || defined(__aarch64__)
+  } else if (neon_available_) {
+    // NEON detected.
+    SetDotProduct(DotProduct, &IntSimdMatrix::intSimdMatrixNEON);
 #endif
  }
 }
@ -174,7 +222,7 @@ SIMDDetect::SIMDDetect() {
 void SIMDDetect::Update() {
  // Select code for calculation of dot product based on the
  // value of the config variable if that value is not empty.
-  const char* dotproduct_method = "generic";
+  const char *dotproduct_method = "generic";
  if (!strcmp(dotproduct.c_str(), "auto")) {
    // Automatic detection. Nothing to be done.
  } else if (!strcmp(dotproduct.c_str(), "generic")) {
@ -217,17 +265,18 @@ void SIMDDetect::Update() {
    // Unsupported value of config variable.
    tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
            dotproduct.c_str());
-    tprintf("Support values for dotproduct: auto generic native"
+    tprintf(
+        "Support values for dotproduct: auto generic native"
 #if defined(HAVE_AVX)
-            " avx"
+        " avx"
 #endif
 #if defined(HAVE_SSE4_1)
-            " sse"
+        " sse"
 #endif
-            " std::inner_product.\n");
+        " std::inner_product.\n");
  }

  dotproduct.set_value(dotproduct_method);
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/arch/simddetect.h
+++ b/src/arch/simddetect.h
@ -17,19 +17,19 @@
 #ifndef TESSERACT_ARCH_SIMDDETECT_H_
 #define TESSERACT_ARCH_SIMDDETECT_H_

-#include <tesseract/platform.h>
+#include <tesseract/export.h>

 namespace tesseract {

 // Function pointer for best calculation of dot product.
-using DotProductFunction = double (*)(const double*, const double*, int);
+using DotProductFunction = double (*)(const double *, const double *, int);
 extern DotProductFunction DotProduct;

 // Architecture detector. Add code here to detect any other architectures for
 // SIMD-based faster dot product functions. Intended to be a single static
 // object, but it does no real harm to have more than one.
 class SIMDDetect {
- public:
+public:
  // Returns true if AVX is available on this system.
  static inline bool IsAVXAvailable() {
    return detector.avx_available_;
@ -54,15 +54,19 @@ class SIMDDetect {
  static inline bool IsSSEAvailable() {
    return detector.sse_available_;
  }
+  // Returns true if NEON is available on this system.
+  static inline bool IsNEONAvailable() {
+    return detector.neon_available_;
+  }

  // Update settings after config variable was set.
  static TESS_API void Update();

- private:
+private:
  // Constructor, must set all static member variables.
  SIMDDetect();

- private:
+private:
  // Singleton.
  static SIMDDetect detector;
  // If true, then AVX has been detected.
@ -74,8 +78,10 @@ class SIMDDetect {
  static TESS_API bool fma_available_;
  // If true, then SSE4.1 has been detected.
  static TESS_API bool sse_available_;
+  // If true, then NEON has been detected.
+  static TESS_API bool neon_available_;
 };

-}  // namespace tesseract
+} // namespace tesseract

-#endif  // TESSERACT_ARCH_SIMDDETECT_H_
+#endif // TESSERACT_ARCH_SIMDDETECT_H_
--- a/src/ccmain/adaptions.cpp
+++ b/src/ccmain/adaptions.cpp
@ -19,32 +19,30 @@

 #include <cctype>
 #include <cstring>
-#include "tessvars.h"
-#include "reject.h"
 #include "control.h"
+#include "reject.h"
 #include "stopper.h"
 #include "tesseractclass.h"
+#include "tessvars.h"

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
-#include "config_auto.h"
+#  include "config_auto.h"
 #endif

 namespace tesseract {
-bool Tesseract::word_adaptable(  //should we adapt?
-        WERD_RES* word,
-        uint16_t mode) {
+bool Tesseract::word_adaptable( // should we adapt?
+    WERD_RES *word, uint16_t mode) {
  if (tessedit_adaption_debug) {
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
-          word->best_choice->unichar_string().c_str(),
-          word->best_choice->rating(), word->best_choice->certainty());
+            word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
+            word->best_choice->certainty());
  }

  bool status = false;
  BITS16 flags(mode);

-  enum MODES
-  {
+  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
@ -54,54 +52,57 @@ bool Tesseract::word_adaptable(  //should we adapt?
  };

  /*
-  0: NO adaption
-  */
+0: NO adaption
+*/
  if (mode == 0) {
-    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
+    if (tessedit_adaption_debug)
+      tprintf("adaption disabled\n");
    return false;
  }

-  if (flags.bit (ADAPTABLE_WERD)) {
-    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
+  if (flags[ADAPTABLE_WERD]) {
+    status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

-  if (flags.bit (ACCEPTABLE_WERD)) {
+  if (flags[ACCEPTABLE_WERD]) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

-  if (!status) {                  // If not set then
-    return false;                // ignore other checks
+  if (!status) {  // If not set then
+    return false; // ignore other checks
  }

-  if (flags.bit (CHECK_DAWGS) &&
-    (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
-    (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
-    (word->best_choice->permuter () != USER_DAWG_PERM) &&
-    (word->best_choice->permuter () != NUMBER_PERM)) {
-    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
+  if (flags[CHECK_DAWGS] && (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
+      (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
+      (word->best_choice->permuter() != USER_DAWG_PERM) &&
+      (word->best_choice->permuter() != NUMBER_PERM)) {
+    if (tessedit_adaption_debug)
+      tprintf("word not in dawgs\n");
    return false;
  }

-  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, false)) {
-    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
+  if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {
+    if (tessedit_adaption_debug)
+      tprintf("word has ell conflict\n");
    return false;
  }

-  if (flags.bit (CHECK_SPACES) &&
-    (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
-    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
+  if (flags[CHECK_SPACES] &&
+      (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
+    if (tessedit_adaption_debug)
+      tprintf("word contains spaces\n");
    return false;
  }

-  if (flags.bit (CHECK_AMBIG_WERD) &&
-      word->best_choice->dangerous_ambig_found()) {
-    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
+  if (flags[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {
+    if (tessedit_adaption_debug)
+      tprintf("word is ambiguous\n");
    return false;
  }

@ -111,4 +112,4 @@ bool Tesseract::word_adaptable(  //should we adapt?
  return status;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/applybox.cpp
+++ b/src/ccmain/applybox.cpp
@ -16,22 +16,26 @@
 *
 **********************************************************************/

-#include <cctype>
-#include <cerrno>
-#include <cstring>
-#include "allheaders.h"
-#include "boxread.h"
-#include "pageres.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include <allheaders.h>
+#  include <cctype>
+#  include <cerrno>
+#  include <cstring>
+#  include "boxread.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
 #include <tesseract/unichar.h>
-#include "unicharset.h"
+#include "genericvector.h"
+#include "pageres.h"
 #include "tesseractclass.h"
-#include <tesseract/genericvector.h>
+#include "unicharset.h"

+#ifndef DISABLED_LEGACY_ENGINE
 /** Max number of blobs to classify together in FindSegmentation. */
 const int kMaxGroupSize = 4;
 /// Max fraction of median allowed as deviation in xheight before switching
 /// to median.
 const double kMaxXHeightDeviationFraction = 0.125;
+#endif // ndef DISABLED_LEGACY_ENGINE

 /**
 * The box file is assumed to contain box definitions, one per line, of the
@ -73,20 +77,18 @@ namespace tesseract {
 #ifndef DISABLED_LEGACY_ENGINE
 static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
-  for (block_it.mark_cycle_pt();
-       !block_it.cycled_list(); block_it.forward()) {
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
-      for (word_it.mark_cycle_pt();
-           !word_it.cycled_list(); word_it.forward()) {
+      for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
        word_it.data()->set_text("");
      }
    }
  }
 }

-// Applies the box file based on the image name fname, and resegments
+// Applies the box file based on the image name filename, and resegments
 // the words in the block_list (page), with:
 // blob-mode: one blob per line in the box file, words as input.
 // word/line-mode: one blob per space-delimited unit after the #, and one word
@ -106,14 +108,12 @@ static void clear_any_old_text(BLOCK_LIST *block_list) {
 // Instead, the correct_text member of WERD_RES is set, and this may be later
 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
 // is not required before calling ApplyBoxTraining.
-PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
-                                bool find_segmentation,
+PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
                                BLOCK_LIST *block_list) {
-  GenericVector<TBOX> boxes;
-  GenericVector<STRING> texts, full_texts;
-  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
-                    nullptr)) {
-    return nullptr;  // Can't do it.
+  std::vector<TBOX> boxes;
+  std::vector<std::string> texts, full_texts;
+  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
+    return nullptr; // Can't do it.
  }

  const int box_count = boxes.size();
@ -121,27 +121,22 @@ PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
-  PAGE_RES* page_res = find_segmentation ?
-      nullptr : SetupApplyBoxes(boxes, block_list);
+  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < box_count; i++) {
    bool foundit = false;
    if (page_res != nullptr) {
-      foundit = ResegmentCharBox(page_res,
-                                 (i == 0) ? nullptr : &boxes[i - 1],
-                                 boxes[i],
-                                 (i == box_count - 1) ? nullptr : &boxes[i + 1],
-                                 full_texts[i].c_str());
+      foundit =
+          ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
+                           (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i],
-                                 (i == box_count - 1) ? nullptr : &boxes[i + 1],
-                                 texts[i].c_str());
+                                 (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
    }
    if (!foundit) {
      box_failures++;
-      ReportFailedBox(i, boxes[i], texts[i].c_str(),
-                      "FAILURE! Couldn't find a matching blob");
+      ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
    }
  }

@ -160,14 +155,12 @@ PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
  TidyUp(page_res);
  return page_res;
 }
-#endif  // ndef DISABLED_LEGACY_ENGINE

 // Helper computes median xheight in the image.
 static double MedianXHeight(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
-  for (block_it.mark_cycle_pt();
-       !block_it.cycled_list(); block_it.forward()) {
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
@ -184,15 +177,14 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  // Reset any row whose xheight deviates too far from the median xheight.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
-    BLOCK* block = b_it.data();
+    BLOCK *block = b_it.data();
    ROW_IT r_it(block->row_list());
-    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
-      ROW* row = r_it.data();
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+      ROW *row = r_it.data();
      const double diff = fabs(row->x_height() - median_xheight);
      if (diff > max_deviation) {
        if (applybox_debug) {
-          tprintf("row xheight=%g, but median xheight = %g\n",
-                  row->x_height(), median_xheight);
+          tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
        }
        row->set_x_height(static_cast<float>(median_xheight));
      }
@ -200,23 +192,20 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  }
 }

-#ifndef DISABLED_LEGACY_ENGINE
-
 /// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
 /// All fuzzy spaces are removed, and all the words are maximally chopped.
-PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
-                                     BLOCK_LIST *block_list) {
+PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  PreenXHeights(block_list);
  // Strip all fuzzy space markers to simplify the PAGE_RES.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
-    BLOCK* block = b_it.data();
+    BLOCK *block = b_it.data();
    ROW_IT r_it(block->row_list());
-    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
-      ROW* row = r_it.data();
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+      ROW *row = r_it.data();
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
-        WERD* word = w_it.data();
+        WERD *word = w_it.data();
        if (word->cblob_list()->empty()) {
          delete w_it.extract();
        } else {
@ -226,12 +215,11 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
      }
    }
  }
-  auto* page_res = new PAGE_RES(false, block_list, nullptr);
+  auto *page_res = new PAGE_RES(false, block_list, nullptr);
  PAGE_RES_IT pr_it(page_res);
-  WERD_RES* word_res;
+  WERD_RES *word_res;
  while ((word_res = pr_it.word()) != nullptr) {
-    MaximallyChopWord(boxes, pr_it.block()->block,
-                      pr_it.row()->row, word_res);
+    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
    pr_it.forward();
  }
  return page_res;
@ -240,15 +228,11 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
 /// Tests the chopper by exhaustively running chop_one_blob.
 /// The word_res will contain filled chopped_word, seam_array, denorm,
 /// box_word and best_state for the maximally chopped word.
-void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
-                                  BLOCK* block, ROW* row,
-                                  WERD_RES* word_res) {
-  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
-                                     tessedit_ocr_engine_mode, nullptr,
-                                     classify_bln_numeric_mode,
-                                     textord_use_cjk_fp_model,
-                                     poly_allow_detailed_fx,
-                                     row, block)) {
+void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
+                                  WERD_RES *word_res) {
+  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
+                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
+                                     poly_allow_detailed_fx, row, block)) {
    word_res->CloneChoppedToRebuild();
    return;
  }
@ -256,7 +240,7 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
    tprintf("Maximally chopping word at:");
    word_res->word->bounding_box().print();
  }
-  GenericVector<BLOB_CHOICE*> blob_choices;
+  GenericVector<BLOB_CHOICE *> blob_choices;
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
  auto rating = static_cast<float>(INT8_MAX);
  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
@ -268,28 +252,25 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
    // produced, however much chopping is required. The chops are thus only
    // limited by the ability of the chopper to find suitable chop points,
    // and not by the value of the certainties.
-    auto* choice =
-        new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
+    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
    blob_choices.push_back(choice);
    rating -= 0.125f;
  }
-  const double e = exp(1.0);  // The base of natural logs.
+  const double e = exp(1.0); // The base of natural logs.
  int blob_number;
  int right_chop_index = 0;
  if (!assume_fixed_pitch_char_segment) {
    // We only chop if the language is not fixed pitch like CJK.
-    SEAM* seam = nullptr;
-    while ((seam = chop_one_blob(boxes, blob_choices, word_res,
-                                 &blob_number)) != nullptr) {
+    SEAM *seam = nullptr;
+    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
      word_res->InsertSeam(blob_number, seam);
-      BLOB_CHOICE* left_choice = blob_choices[blob_number];
+      BLOB_CHOICE *left_choice = blob_choices[blob_number];
      rating = left_choice->rating() / e;
      left_choice->set_rating(rating);
      left_choice->set_certainty(-rating);
      // combine confidence w/ serial #
-      auto* right_choice = new BLOB_CHOICE(++right_chop_index,
-                                                  rating - 0.125f, -rating, -1,
-                                                  0.0f, 0.0f, 0.0f, BCC_FAKE);
+      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
+                                           0.0f, 0.0f, BCC_FAKE);
      blob_choices.insert(right_choice, blob_number + 1);
    }
  }
@ -308,7 +289,7 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
 /// Given a box with area A, and a blob with area B, with overlap area C,
 /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
 /// miss metric gets the blob.
-static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
+static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
  const int overlap_area = box1.intersection(box2).area();
  const int a = box1.area();
  const int b = box2.area();
@ -326,16 +307,14 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
 ///
 /// This means that occasionally, blobs may be incorrectly segmented if the
 /// chopper fails to find a suitable chop point.
-bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
-                                 const TBOX& box, const TBOX* next_box,
-                                 const char* correct_text) {
+bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
+                                 const TBOX *next_box, const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  }
  PAGE_RES_IT page_res_it(page_res);
-  WERD_RES* word_res;
-  for (word_res = page_res_it.word(); word_res != nullptr;
-       word_res = page_res_it.forward()) {
+  WERD_RES *word_res;
+  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
    if (!word_res->box_word->bounding_box().major_overlap(box))
      continue;
    if (applybox_debug > 1) {
@ -351,18 +330,18 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
        if (!blob_box.major_overlap(box))
          break;
        if (word_res->correct_text[i + blob_count].length() > 0)
-          break;  // Blob is claimed already.
+          break; // Blob is claimed already.
        if (next_box != nullptr) {
          const double current_box_miss_metric = BoxMissMetric(blob_box, box);
          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
-            tprintf("Current miss metric = %g, next = %g\n",
-                    current_box_miss_metric, next_box_miss_metric);
+            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
+                    next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric)
-            break;  // Blob is a better match for next box.
+            break; // Blob is a better match for next box.
        }
        char_box += blob_box;
      }
@ -371,7 +350,7 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
        }
        if (!char_box.almost_equal(box, 3) &&
-            ((next_box != nullptr && box.x_gap(*next_box) < -3)||
+            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
          return false;
        }
@ -419,7 +398,7 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
  if (applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
-  return false;  // Failure.
+  return false; // Failure.
 }

 /// Consume all source blobs that strongly overlap the given box,
@ -428,38 +407,36 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
 /// applying the blobs to box or next_box with the least non-overlap.
 /// @return false if the box was in error, which can only be caused by
 /// failing to find an overlapping blob for a box.
-bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
-                                 const TBOX& box, const TBOX* next_box,
-                                 const char* correct_text) {
+bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
+                                 const char *correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
-  WERD* new_word = nullptr;
+  WERD *new_word = nullptr;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
-    BLOCK* block = b_it.data();
+    BLOCK *block = b_it.data();
    if (!box.major_overlap(block->pdblk.bounding_box()))
      continue;
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
-      ROW* row = r_it.data();
+      ROW *row = r_it.data();
      if (!box.major_overlap(row->bounding_box()))
        continue;
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
-        WERD* word = w_it.data();
+        WERD *word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != nullptr && word->text()[0] != '\0')
-          continue;  // Ignore words that are already done.
+          continue; // Ignore words that are already done.
        if (!box.major_overlap(word->bounding_box()))
          continue;
        C_BLOB_IT blob_it(word->cblob_list());
-        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
-             blob_it.forward()) {
-          C_BLOB* blob = blob_it.data();
+        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+          C_BLOB *blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
@ -469,11 +446,11 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
            if (applybox_debug > 2) {
              tprintf("Checking blob:");
              blob_box.print();
-              tprintf("Current miss metric = %g, next = %g\n",
-                      current_box_miss_metric, next_box_miss_metric);
+              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
+                      next_box_miss_metric);
            }
            if (current_box_miss_metric > next_box_miss_metric)
-              continue;  // Blob is a better match for next box.
+              continue; // Blob is a better match for next box.
          }
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
@ -497,44 +474,40 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
      }
    }
  }
-  if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
+  if (new_word == nullptr && applybox_debug > 0)
+    tprintf("FAIL!\n");
  return new_word != nullptr;
 }

 /// Resegments the words by running the classifier in an attempt to find the
 /// correct segmentation that produces the required string.
-void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
+void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
-  WERD_RES* word_res;
+  WERD_RES *word_res;
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
-    const WERD* word = word_res->word;
+    const WERD *word = word_res->word;
    if (word->text() == nullptr || word->text()[0] == '\0')
-      continue;  // Ignore words that have no text.
+      continue; // Ignore words that have no text.
    // Convert the correct text to a vector of UNICHAR_ID
    GenericVector<UNICHAR_ID> target_text;
    if (!ConvertStringToUnichars(word->text(), &target_text)) {
-      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
-              word->text());
+      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
      pr_it.DeleteCurrentWord();
      continue;
    }
    if (!FindSegmentation(target_text, word_res)) {
-      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
-              word->text());
+      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
      pr_it.DeleteCurrentWord();
      continue;
    }
  }
 }

-#endif  // ndef DISABLED_LEGACY_ENGINE
-
 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
 /// @return false if an invalid UNICHAR_ID is encountered.
-bool Tesseract::ConvertStringToUnichars(const char* utf8,
-                                        GenericVector<UNICHAR_ID>* class_ids) {
+bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids) {
  for (int step = 0; *utf8 != '\0'; utf8 += step) {
-    const char* next_space = strchr(utf8, ' ');
+    const char *next_space = strchr(utf8, ' ');
    if (next_space == nullptr)
      next_space = utf8 + strlen(utf8);
    step = next_space - utf8;
@ -549,26 +522,21 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
  return true;
 }

-#ifndef DISABLED_LEGACY_ENGINE
-
-
 /// Resegments the word to achieve the target_text from the classifier.
 /// Returns false if the re-segmentation fails.
 /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
 /// applies a full search on the classifier results to find the best classified
 /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
 /// substitutions ARE used.
-bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
-                                 WERD_RES* word_res) {
+bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
  // Classify all required combinations of blobs and save results in choices.
  const int word_length = word_res->box_word->length();
-  auto* choices =
-      new GenericVector<BLOB_CHOICE_LIST*>[word_length];
+  auto *choices = new GenericVector<BLOB_CHOICE_LIST *>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
-      BLOB_CHOICE_LIST* match_result = classify_piece(
-          word_res->seam_array, i, i + j - 1, "Applybox",
-          word_res->chopped_word, word_res->blamer_bundle);
+      BLOB_CHOICE_LIST *match_result =
+          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
+                         word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
@ -582,17 +550,17 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
-  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
-                &search_segmentation, &best_rating, &word_res->best_state);
+  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
+                &word_res->best_state);
  for (int i = 0; i < word_length; ++i)
    choices[i].delete_data_pointers();
-  delete [] choices;
+  delete[] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (int s = 0; s < word_res->seam_array.size(); ++s) {
-      SEAM* seam = word_res->seam_array[s];
+      SEAM *seam = word_res->seam_array[s];
      if (!seam->HasAnySplits()) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
@ -602,14 +570,13 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
-      word_res->best_state.clear();  // No good. Original segmentation bad size.
+      word_res->best_state.clear(); // No good. Original segmentation bad size.
      return false;
    }
  }
  word_res->correct_text.clear();
  for (int i = 0; i < target_text.size(); ++i) {
-    word_res->correct_text.push_back(
-        STRING(unicharset.id_to_unichar(target_text[i])));
+    word_res->correct_text.push_back(unicharset.id_to_unichar(target_text[i]));
  }
  return true;
 }
@ -628,22 +595,18 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
 /// @param segmentation
 /// @param best_rating
 /// @param best_segmentation
-void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
-                              int choices_pos, int choices_length,
-                              const GenericVector<UNICHAR_ID>& target_text,
-                              int text_index,
-                              float rating, GenericVector<int>* segmentation,
-                              float* best_rating,
-                              GenericVector<int>* best_segmentation) {
-  const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
+void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
+                              int choices_length, const GenericVector<UNICHAR_ID> &target_text,
+                              int text_index, float rating, GenericVector<int> *segmentation,
+                              float *best_rating, GenericVector<int> *best_segmentation) {
+  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
-    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
-         choice_it.forward()) {
-      const BLOB_CHOICE* choice = choice_it.data();
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      const BLOB_CHOICE *choice = choice_it.data();
      choice_rating = choice->rating();
      UNICHAR_ID class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
@ -652,8 +615,7 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
      // Search ambigs table.
      if (class_id < table.size() && table[class_id] != nullptr) {
        AmbigSpec_IT spec_it(table[class_id]);
-        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
-             spec_it.forward()) {
+        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
@ -661,14 +623,13 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
            break;
        }
        if (!spec_it.cycled_list())
-          break;  // Found an ambig.
+          break; // Found an ambig.
      }
    }
    if (choice_it.cycled_list())
-      continue;  // No match.
+      continue; // No match.
    segmentation->push_back(length);
-    if (choices_pos + length == choices_length &&
-        text_index + 1 == target_text.size()) {
+    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
@ -679,19 +640,15 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
-    } else if (choices_pos + length < choices_length &&
-               text_index + 1 < target_text.size()) {
+    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
      if (applybox_debug > 3) {
-        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
-                target_text[text_index],
+        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
-                choice_it.data()->unichar_id() == target_text[text_index]
-                     ? "Match" : "Ambig",
+                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
                choices_pos, length);
      }
-      SearchForText(choices, choices_pos + length, choices_length, target_text,
-                    text_index + 1, rating + choice_rating, segmentation,
-                    best_rating, best_segmentation);
+      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
+                    rating + choice_rating, segmentation, best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
@ -705,17 +662,17 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
 /// - Deletes all unused or emptied words, counting the unused ones.
 /// - Resets W_BOL and W_EOL flags correctly.
 /// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
-void Tesseract::TidyUp(PAGE_RES* page_res) {
+void Tesseract::TidyUp(PAGE_RES *page_res) {
  int ok_blob_count = 0;
  int bad_blob_count = 0;
  int ok_word_count = 0;
  int unlabelled_words = 0;
  PAGE_RES_IT pr_it(page_res);
-  WERD_RES* word_res;
+  WERD_RES *word_res;
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    int ok_in_word = 0;
    int blob_count = word_res->correct_text.size();
-    auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
+    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
    word_choice->set_permuter(TOP_CHOICE_PERM);
    for (int c = 0; c < blob_count; ++c) {
      if (word_res->correct_text[c].length() > 0) {
@ -725,8 +682,8 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
      // can be called while training Tesseract, at the stage where
      // unicharset is not meaningful yet.
-      word_choice->append_unichar_id_space_allocated(
-          INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
+      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
+                                                     1.0f, -1.0f);
    }
    if (ok_in_word > 0) {
      ok_blob_count += ok_in_word;
@ -754,40 +711,45 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
  if (applybox_debug > 0) {
    tprintf("   Found %d good blobs.\n", ok_blob_count);
    if (bad_blob_count > 0) {
-      tprintf("   Leaving %d unlabelled blobs in %d words.\n",
-              bad_blob_count, ok_word_count);
+      tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
    }
    if (unlabelled_words > 0)
      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
  }
 }

-#endif  // ndef DISABLED_LEGACY_ENGINE
-
 /** Logs a bad box by line in the box file and box coords.*/
-void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
-                                const char *box_ch, const char *err_msg) {
-  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
-          boxfile_lineno + 1, box_ch,
+void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
+                                const char *err_msg) {
+  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
          box.left(), box.bottom(), box.right(), box.top(), err_msg);
 }

-/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
-void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
+/// Calls #LearnWord to extract features for labelled blobs within each word.
+/// Features are stored in an internal buffer.
+void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
-  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
-       word_res = pr_it.forward()) {
-    auto* choice = new WERD_CHOICE(word_res->uch_set,
-                                          word_res->correct_text.size());
+  int word_count = 0;
+  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
+    LearnWord(fontname.c_str(), word_res);
+    ++word_count;
+  }
+  tprintf("Generated training data for %d words\n", word_count);
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
+void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
+    auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
    for (int i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
-      GenericVector<STRING> tokens;
-      word_res->correct_text[i].split(' ', &tokens);
+      std::vector<std::string> tokens = split(word_res->correct_text[i], ' ');
      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
-      choice->append_unichar_id_space_allocated(char_id,
-                                                word_res->best_state[i],
-                                                0.0f, 0.0f);
+      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[i], 0.0f, 0.0f);
    }
    word_res->ClearWordChoices();
    word_res->LogNewRawChoice(choice);
@ -795,22 +757,4 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
  }
 }

-#ifndef DISABLED_LEGACY_ENGINE
-
-
-/// Calls #LearnWord to extract features for labelled blobs within each word.
-/// Features are stored in an internal buffer.
-void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
-  PAGE_RES_IT pr_it(page_res);
-  int word_count = 0;
-  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
-       word_res = pr_it.forward()) {
-    LearnWord(fontname.c_str(), word_res);
-    ++word_count;
-  }
-  tprintf("Generated training data for %d words\n", word_count);
-}
-
-#endif  // ndef DISABLED_LEGACY_ENGINE
-
-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/control.cpp
+++ b/src/ccmain/control.cpp
--- a/src/ccmain/control.h
+++ b/src/ccmain/control.h
@ -25,14 +25,13 @@
 #ifndef CONTROL_H
 #define CONTROL_H

-enum ACCEPTABLE_WERD_TYPE
-{
-  AC_UNACCEPTABLE,               ///< Unacceptable word
-  AC_LOWER_CASE,                 ///< ALL lower case
-  AC_UPPER_CASE,                 ///< ALL upper case
-  AC_INITIAL_CAP,                ///< ALL but initial lc
-  AC_LC_ABBREV,                  ///< a.b.c.
-  AC_UC_ABBREV                   ///< A.B.C.
+enum ACCEPTABLE_WERD_TYPE {
+  AC_UNACCEPTABLE, ///< Unacceptable word
+  AC_LOWER_CASE,   ///< ALL lower case
+  AC_UPPER_CASE,   ///< ALL upper case
+  AC_INITIAL_CAP,  ///< ALL but initial lc
+  AC_LC_ABBREV,    ///< a.b.c.
+  AC_UC_ABBREV     ///< A.B.C.
 };

 #endif
--- a/src/ccmain/docqual.cpp
+++ b/src/ccmain/docqual.cpp
@ -16,27 +16,27 @@
 *
 **********************************************************************/

-#include <cctype>
 #include "docqual.h"
+#include <cctype>
 #include "reject.h"
-#include "tessvars.h"
 #include "tesseractclass.h"
+#include "tessvars.h"

-namespace tesseract{
+namespace tesseract {

-static void countMatchingBlobs(int16_t& match_count, int /*index*/) {
+static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  ++match_count;
 }

-static void countAcceptedBlobs(WERD_RES* word, int16_t& match_count,
-                               int16_t& accepted_match_count, int index) {
+static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
+                               int index) {
  if (word->reject_map[index].accepted()) {
    ++accepted_match_count;
  }
  ++match_count;
 }

-static void acceptIfGoodQuality(WERD_RES* word, int index) {
+static void acceptIfGoodQuality(WERD_RES *word, int index) {
  if (word->reject_map[index].accept_if_good_quality()) {
    word->reject_map[index].setrej_quality_accept();
  }
@ -48,14 +48,13 @@ static void acceptIfGoodQuality(WERD_RES* word, int index) {
 * ASSUME blobs in both initial word and box_word are in ascending order of
 * left hand blob edge.
 *************************************************************************/
-int16_t Tesseract::word_blob_quality(WERD_RES* word) {
+int16_t Tesseract::word_blob_quality(WERD_RES *word) {
  int16_t match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
-    using namespace std::placeholders;  // for _1
-    word->bln_boxes->ProcessMatchedBlobs(
-        *word->rebuild_word,
-        std::bind(countMatchingBlobs, match_count, _1));
+    using namespace std::placeholders; // for _1
+    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
+                                         std::bind(countMatchingBlobs, match_count, _1));
  }
  return match_count;
 }
@ -66,9 +65,8 @@ int16_t Tesseract::word_outline_errs(WERD_RES *word) {

  if (word->rebuild_word != nullptr) {
    for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
-      TBLOB* blob = word->rebuild_word->blobs[b];
-      err_count += count_outline_errs(word->best_choice->unichar_string()[i],
-                                      blob->NumOutlines());
+      TBLOB *blob = word->rebuild_word->blobs[b];
+      err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
      i++;
    }
  }
@ -80,17 +78,16 @@ int16_t Tesseract::word_outline_errs(WERD_RES *word) {
 * Combination of blob quality and outline quality - how many good chars are
 * there? - I.e chars which pass the blob AND outline tests.
 *************************************************************************/
-void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
-                                  int16_t* accepted_match_count) {
+void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
+                                  int16_t *accepted_match_count) {
  *match_count = 0;
  *accepted_match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
-    using namespace std::placeholders;  // for _1
+    using namespace std::placeholders; // for _1
    word->bln_boxes->ProcessMatchedBlobs(
        *word->rebuild_word,
-        std::bind(countAcceptedBlobs,
-                  word, *match_count, *accepted_match_count, _1));
+        std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
  }
 }

@ -98,29 +95,28 @@ void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
 * unrej_good_chs()
 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
 *************************************************************************/
-void Tesseract::unrej_good_chs(WERD_RES* word) {
+void Tesseract::unrej_good_chs(WERD_RES *word) {
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      word->rebuild_word->blobs.empty()) {
-    using namespace std::placeholders;  // for _1
-    word->bln_boxes->ProcessMatchedBlobs(
-      *word->rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
+    using namespace std::placeholders; // for _1
+    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
+                                         std::bind(acceptIfGoodQuality, word, _1));
  }
 }

 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
  int expected_outline_count;

-  if (STRING (outlines_odd).contains (c))
-    return 0;  // Don't use this char
-  else if (STRING (outlines_2).contains (c))
+  if (outlines_odd.contains(c))
+    return 0; // Don't use this char
+  else if (outlines_2.contains(c))
    expected_outline_count = 2;
  else
    expected_outline_count = 1;
-  return abs (outline_count - expected_outline_count);
+  return abs(outline_count - expected_outline_count);
 }

-void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
-                                        bool good_quality_doc) {
+void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  if ((tessedit_good_quality_unrej && good_quality_doc))
    unrej_good_quality_words(page_res_it);
  doc_and_block_rejection(page_res_it, good_quality_doc);
@ -141,71 +137,65 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
 *      - CAN'T do it in a single pass without a bit of fiddling
 *    - keep it simple but inefficient
 *************************************************************************/
-void Tesseract::unrej_good_quality_words(  //unreject potential
-                                         PAGE_RES_IT &page_res_it) {
+void Tesseract::unrej_good_quality_words( // unreject potential
+    PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  ROW_RES *current_row;
  BLOCK_RES *current_block;
  int i;

-  page_res_it.restart_page ();
-  while (page_res_it.word () != nullptr) {
-    check_debug_pt (page_res_it.word (), 100);
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
-      word = page_res_it.word ();
-      for (i = 0; i < word->reject_map.length (); i++) {
-        if (word->reject_map[i].accept_if_good_quality ())
-          word->reject_map[i].setrej_quality_accept ();
+      word = page_res_it.word();
+      for (i = 0; i < word->reject_map.length(); i++) {
+        if (word->reject_map[i].accept_if_good_quality())
+          word->reject_map[i].setrej_quality_accept();
      }
-      page_res_it.forward ();
-    }
-    else if ((page_res_it.row ()->char_count > 0) &&
-      ((page_res_it.row ()->rej_count /
-      static_cast<float>(page_res_it.row ()->char_count)) <=
-    quality_rowrej_pc)) {
-      word = page_res_it.word ();
+      page_res_it.forward();
+    } else if ((page_res_it.row()->char_count > 0) &&
+               ((page_res_it.row()->rej_count /
+                 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
+      word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
-           acceptable_word_string(*word->uch_set,
-                                  word->best_choice->unichar_string().c_str(),
-                                  word->best_choice->unichar_lengths().c_str())
-               != AC_UNACCEPTABLE)) {
+           acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
+                                  word->best_choice->unichar_lengths().c_str()) !=
+               AC_UNACCEPTABLE)) {
        unrej_good_chs(word);
      }
-      page_res_it.forward ();
-    }
-    else {
+      page_res_it.forward();
+    } else {
      // Skip to end of dodgy row.
-      current_row = page_res_it.row ();
-      while ((page_res_it.word () != nullptr) &&
-        (page_res_it.row () == current_row))
-        page_res_it.forward ();
+      current_row = page_res_it.row();
+      while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row))
+        page_res_it.forward();
    }
-    check_debug_pt (page_res_it.word (), 110);
+    check_debug_pt(page_res_it.word(), 110);
  }
-  page_res_it.restart_page ();
+  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  current_block = nullptr;
  current_row = nullptr;
-  while (page_res_it.word () != nullptr) {
-    if (current_block != page_res_it.block ()) {
-      current_block = page_res_it.block ();
+  while (page_res_it.word() != nullptr) {
+    if (current_block != page_res_it.block()) {
+      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
-    if (current_row != page_res_it.row ()) {
-      current_row = page_res_it.row ();
+    if (current_row != page_res_it.row()) {
+      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
-    page_res_it.rej_stat_word ();
-    page_res_it.forward ();
+    page_res_it.rej_stat_word();
+    page_res_it.forward();
  }
 }

-
 /*************************************************************************
 * doc_and_block_rejection()
 *
@ -213,9 +203,8 @@ void Tesseract::unrej_good_quality_words(  //unreject potential
 * If any block has too many rejects - reject all words in the block
 *************************************************************************/

-void Tesseract::doc_and_block_rejection(  //reject big chunks
-                                        PAGE_RES_IT &page_res_it,
-                                        bool good_quality_doc) {
+void Tesseract::doc_and_block_rejection( // reject big chunks
+    PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  int16_t block_no = 0;
  int16_t row_no = 0;
  BLOCK_RES *current_block;
@ -226,49 +215,43 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
  int16_t char_quality = 0;
  int16_t accepted_char_quality;

-  if (page_res_it.page_res->rej_count * 100.0 /
-      page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
+  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
+      tessedit_reject_doc_percent) {
    reject_whole_page(page_res_it);
    if (tessedit_debug_doc_rejection) {
-      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
-              page_res_it.page_res->char_count,
+      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }
  } else {
    if (tessedit_debug_doc_rejection) {
-      tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
-              page_res_it.page_res->char_count,
+      tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n", page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }

    /* Walk blocks testing for block rejection */

    page_res_it.restart_page();
-    WERD_RES* word;
+    WERD_RES *word;
    while ((word = page_res_it.word()) != nullptr) {
      current_block = page_res_it.block();
      block_no = current_block->block->pdblk.index();
      if (current_block->char_count > 0 &&
          (current_block->rej_count * 100.0 / current_block->char_count) >
-           tessedit_reject_block_percent) {
+              tessedit_reject_block_percent) {
        if (tessedit_debug_block_rejection) {
-          tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
-                  block_no, current_block->char_count,
-                  current_block->rej_count);
+          tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n", block_no,
+                  current_block->char_count, current_block->rej_count);
        }
        prev_word_rejected = false;
-        while ((word = page_res_it.word()) != nullptr &&
-               (page_res_it.block() == current_block)) {
+        while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
          if (tessedit_preserve_blk_rej_perfect_wds) {
            rej_word = word->reject_map.reject_count() > 0 ||
-                word->reject_map.length () < tessedit_preserve_min_wd_len;
+                       word->reject_map.length() < tessedit_preserve_min_wd_len;
            if (rej_word && tessedit_dont_blkrej_good_wds &&
                word->reject_map.length() >= tessedit_preserve_min_wd_len &&
-                acceptable_word_string(
-                    *word->uch_set,
-                    word->best_choice->unichar_string().c_str(),
-                    word->best_choice->unichar_lengths().c_str()) !=
-                AC_UNACCEPTABLE) {
+                acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
+                                       word->best_choice->unichar_lengths().c_str()) !=
+                    AC_UNACCEPTABLE) {
              word_char_quality(word, &char_quality, &accepted_char_quality);
              rej_word = char_quality != word->reject_map.length();
            }
@ -277,14 +260,12 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
          }
          if (rej_word) {
            /*
-              Reject spacing if both current and prev words are rejected.
-              NOTE - this is NOT restricted to FUZZY spaces. - When tried this
-              generated more space errors.
-            */
-            if (tessedit_use_reject_spaces &&
-                prev_word_rejected &&
-                page_res_it.prev_row() == page_res_it.row() &&
-                word->word->space() == 1)
+  Reject spacing if both current and prev words are rejected.
+  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
+  generated more space errors.
+*/
+            if (tessedit_use_reject_spaces && prev_word_rejected &&
+                page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1)
              word->reject_spaces = true;
            word->reject_map.rej_word_block_rej();
          }
@ -293,53 +274,46 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
        }
      } else {
        if (tessedit_debug_block_rejection) {
-          tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
-                  block_no, page_res_it.block()->char_count,
-                  page_res_it.block()->rej_count);
+          tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n", block_no,
+                  page_res_it.block()->char_count, page_res_it.block()->rej_count);
        }

        /* Walk rows in block testing for row rejection */
        row_no = 0;
-        while (page_res_it.word() != nullptr &&
-               page_res_it.block() == current_block) {
+        while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
          current_row = page_res_it.row();
          row_no++;
          /* Reject whole row if:
-            fraction of chars on row which are rejected exceed a limit AND
-            fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
-            limit
-          */
+  fraction of chars on row which are rejected exceed a limit AND
+  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
+  limit
+*/
          if (current_row->char_count > 0 &&
              (current_row->rej_count * 100.0 / current_row->char_count) >
-              tessedit_reject_row_percent &&
-              (current_row->whole_word_rej_count * 100.0 /
-                  current_row->rej_count) <
-              tessedit_whole_wd_rej_row_percent) {
+                  tessedit_reject_row_percent &&
+              (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
+                  tessedit_whole_wd_rej_row_percent) {
            if (tessedit_debug_block_rejection) {
-              tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
-                      row_no, current_row->char_count,
-                      current_row->rej_count);
+              tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n", row_no,
+                      current_row->char_count, current_row->rej_count);
            }
            prev_word_rejected = false;
-            while ((word = page_res_it.word()) != nullptr &&
-                   page_res_it.row () == current_row) {
+            while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
              /* Preserve words on good docs unless they are mostly rejected*/
              if (!tessedit_row_rej_good_docs && good_quality_doc) {
                rej_word = word->reject_map.reject_count() /
-                    static_cast<float>(word->reject_map.length()) >
-                    tessedit_good_doc_still_rowrej_wd;
+                               static_cast<float>(word->reject_map.length()) >
+                           tessedit_good_doc_still_rowrej_wd;
              } else if (tessedit_preserve_row_rej_perfect_wds) {
                /* Preserve perfect words anyway */
                rej_word = word->reject_map.reject_count() > 0 ||
-                    word->reject_map.length () < tessedit_preserve_min_wd_len;
+                           word->reject_map.length() < tessedit_preserve_min_wd_len;
                if (rej_word && tessedit_dont_rowrej_good_wds &&
                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
-                    acceptable_word_string(*word->uch_set,
-                        word->best_choice->unichar_string().c_str(),
-                        word->best_choice->unichar_lengths().c_str()) !=
-                            AC_UNACCEPTABLE) {
-                  word_char_quality(word, &char_quality,
-                                    &accepted_char_quality);
+                    acceptable_word_string(
+                        *word->uch_set, word->best_choice->unichar_string().c_str(),
+                        word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
+                  word_char_quality(word, &char_quality, &accepted_char_quality);
                  rej_word = char_quality != word->reject_map.length();
                }
              } else {
@ -347,14 +321,12 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
              }
              if (rej_word) {
                /*
-                  Reject spacing if both current and prev words are rejected.
-                  NOTE - this is NOT restricted to FUZZY spaces. - When tried
-                  this generated more space errors.
-                */
-                if (tessedit_use_reject_spaces &&
-                    prev_word_rejected &&
-                    page_res_it.prev_row() == page_res_it.row() &&
-                    word->word->space () == 1)
+  Reject spacing if both current and prev words are rejected.
+  NOTE - this is NOT restricted to FUZZY spaces. - When tried
+  this generated more space errors.
+*/
+                if (tessedit_use_reject_spaces && prev_word_rejected &&
+                    page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1)
                  word->reject_spaces = true;
                word->reject_map.rej_word_row_rej();
              }
@ -363,11 +335,10 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
            }
          } else {
            if (tessedit_debug_block_rejection) {
-              tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
-                      row_no, current_row->char_count, current_row->rej_count);
+              tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n", row_no,
+                      current_row->char_count, current_row->rej_count);
            }
-            while (page_res_it.word() != nullptr &&
-                   page_res_it.row() == current_row)
+            while (page_res_it.word() != nullptr && page_res_it.row() == current_row)
              page_res_it.forward();
          }
        }
@ -376,8 +347,6 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
  }
 }

-}  // namespace tesseract
-
 /*************************************************************************
 * reject_whole_page()
 * Don't believe any of it - set the reject map to 00..00 in all words
@ -385,16 +354,15 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks
 *************************************************************************/

 void reject_whole_page(PAGE_RES_IT &page_res_it) {
-  page_res_it.restart_page ();
-  while (page_res_it.word () != nullptr) {
-    page_res_it.word ()->reject_map.rej_word_doc_rej ();
-    page_res_it.forward ();
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    page_res_it.word()->reject_map.rej_word_doc_rej();
+    page_res_it.forward();
  }
-                                 //whole page is rejected
+  // whole page is rejected
  page_res_it.page_res->rejected = true;
 }

-namespace tesseract {
 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  GARBAGE_LEVEL garbage_level;
@ -405,7 +373,7 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
-    POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
+    POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
    if (pb != nullptr && !pb->IsText()) {
      page_res_it.forward();
      continue;
@ -418,109 +386,93 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
    if (crunch_early_merge_tess_fails)
      word->merge_tess_fails();

-    if (word->reject_map.accept_count () != 0) {
+    if (word->reject_map.accept_count() != 0) {
      found_terrible_word = false;
-                                 //Forget earlier potential crunches
+      // Forget earlier potential crunches
      prev_potential_marked = false;
-    }
-    else {
+    } else {
      ok_dict_word = safe_dict_word(word);
      garbage_level = garbage_word(word, ok_dict_word);

-      if ((garbage_level != G_NEVER_CRUNCH) &&
-      (terrible_word_crunch (word, garbage_level))) {
+      if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
        if (crunch_debug > 0) {
-          tprintf ("T CRUNCHING: \"%s\"\n",
-            word->best_choice->unichar_string().c_str());
+          tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
        if (prev_potential_marked) {
-          while (copy_it.word () != word) {
+          while (copy_it.word() != word) {
            if (crunch_debug > 0) {
-              tprintf ("P1 CRUNCHING: \"%s\"\n",
-                copy_it.word()->best_choice->unichar_string().c_str());
+              tprintf("P1 CRUNCHING: \"%s\"\n",
+                      copy_it.word()->best_choice->unichar_string().c_str());
            }
-            copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
-            copy_it.forward ();
+            copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
+            copy_it.forward();
          }
          prev_potential_marked = false;
        }
        found_terrible_word = true;
-      }
-      else if ((garbage_level != G_NEVER_CRUNCH) &&
-        (potential_word_crunch (word,
-      garbage_level, ok_dict_word))) {
+      } else if ((garbage_level != G_NEVER_CRUNCH) &&
+                 (potential_word_crunch(word, garbage_level, ok_dict_word))) {
        if (found_terrible_word) {
          if (crunch_debug > 0) {
-            tprintf ("P2 CRUNCHING: \"%s\"\n",
-              word->best_choice->unichar_string().c_str());
+            tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
          }
          word->unlv_crunch_mode = CR_KEEP_SPACE;
-        }
-        else if (!prev_potential_marked) {
+        } else if (!prev_potential_marked) {
          copy_it = page_res_it;
          prev_potential_marked = true;
          if (crunch_debug > 1) {
-            tprintf ("P3 CRUNCHING: \"%s\"\n",
-              word->best_choice->unichar_string().c_str());
+            tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
          }
        }
-      }
-      else {
+      } else {
        found_terrible_word = false;
-                                 //Forget earlier potential crunches
+        // Forget earlier potential crunches
        prev_potential_marked = false;
        if (crunch_debug > 2) {
-          tprintf ("NO CRUNCH: \"%s\"\n",
-            word->best_choice->unichar_string().c_str());
+          tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
      }
    }
-    page_res_it.forward ();
+    page_res_it.forward();
  }
 }

-
-bool Tesseract::terrible_word_crunch(WERD_RES* word,
-                                     GARBAGE_LEVEL garbage_level) {
+bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  float rating_per_ch;
  int adjusted_len;
  int crunch_mode = 0;

  if ((word->best_choice->unichar_string().length() == 0) ||
      (strspn(word->best_choice->unichar_string().c_str(), " ") ==
-       word->best_choice->unichar_string().unsigned_size()))
+       word->best_choice->unichar_string().size()))
    crunch_mode = 1;
  else {
-    adjusted_len = word->reject_map.length ();
+    adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max)
      adjusted_len = crunch_rating_max;
-    rating_per_ch = word->best_choice->rating () / adjusted_len;
+    rating_per_ch = word->best_choice->rating() / adjusted_len;

    if (rating_per_ch > crunch_terrible_rating)
      crunch_mode = 2;
    else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
      crunch_mode = 3;
-    else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
-      (garbage_level != G_OK))
+    else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) && (garbage_level != G_OK))
      crunch_mode = 4;
-    else if ((rating_per_ch > crunch_poor_garbage_rate) &&
-      (garbage_level != G_OK))
+    else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK))
      crunch_mode = 5;
  }
  if (crunch_mode > 0) {
    if (crunch_debug > 2) {
-      tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
-        crunch_mode, word->best_choice->unichar_string().c_str());
+      tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
+              word->best_choice->unichar_string().c_str());
    }
    return true;
-  }
-  else
+  } else
    return false;
 }

-bool Tesseract::potential_word_crunch(WERD_RES* word,
-                                      GARBAGE_LEVEL garbage_level,
+bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  float rating_per_ch;
  int adjusted_len;
@ -529,11 +481,9 @@ bool Tesseract::potential_word_crunch(WERD_RES* word,
  bool word_crunchable;
  int poor_indicator_count = 0;

-  word_crunchable = !crunch_leave_accept_strings ||
-                    word->reject_map.length() < 3 ||
-                    (acceptable_word_string(*word->uch_set,
-                                            str, lengths) == AC_UNACCEPTABLE &&
-                     !ok_dict_word);
+  word_crunchable =
+      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
+      (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);

  adjusted_len = word->reject_map.length();
  if (adjusted_len > 10)
@ -542,25 +492,21 @@ bool Tesseract::potential_word_crunch(WERD_RES* word,

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
-      tprintf("Potential poor rating on \"%s\"\n",
-              word->best_choice->unichar_string().c_str());
+      tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

-  if (word_crunchable &&
-      word->best_choice->certainty() < crunch_pot_poor_cert) {
+  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
-      tprintf("Potential poor cert on \"%s\"\n",
-              word->best_choice->unichar_string().c_str());
+      tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
-      tprintf("Potential garbage on \"%s\"\n",
-              word->best_choice->unichar_string().c_str());
+      tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }
@ -581,62 +527,55 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  while (page_res_it.word() != nullptr) {
    word = page_res_it.word();

-    delete_mode = word_deletable (word, debug_delete_mode);
+    delete_mode = word_deletable(word, debug_delete_mode);
    if (delete_mode != CR_NONE) {
-      if (word->word->flag (W_BOL) || deleting_from_bol) {
+      if (word->word->flag(W_BOL) || deleting_from_bol) {
        if (crunch_debug > 0) {
-          tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
-            debug_delete_mode,
-            word->best_choice->unichar_string().c_str());
+          tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
+                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = true;
      } else if (word->word->flag(W_EOL)) {
        if (marked_delete_point) {
          while (copy_it.word() != word) {
-            x_delete_mode = word_deletable (copy_it.word (),
-              x_debug_delete_mode);
+            x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
            if (crunch_debug > 0) {
-              tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
-                x_debug_delete_mode,
-                copy_it.word()->best_choice->unichar_string().c_str());
+              tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
+                      copy_it.word()->best_choice->unichar_string().c_str());
            }
-            copy_it.word ()->unlv_crunch_mode = x_delete_mode;
-            copy_it.forward ();
+            copy_it.word()->unlv_crunch_mode = x_delete_mode;
+            copy_it.forward();
          }
        }
        if (crunch_debug > 0) {
-          tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
-            debug_delete_mode,
-            word->best_choice->unichar_string().c_str());
+          tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
+                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = false;
        marked_delete_point = false;
-      }
-      else {
+      } else {
        if (!marked_delete_point) {
          copy_it = page_res_it;
          marked_delete_point = true;
        }
      }
-    }
-    else {
+    } else {
      deleting_from_bol = false;
-                                 //Forget earlier potential crunches
+      // Forget earlier potential crunches
      marked_delete_point = false;
    }
    /*
-      The following step has been left till now as the tess fails are used to
-      determine if the word is deletable.
-    */
+  The following step has been left till now as the tess fails are used to
+  determine if the word is deletable.
+*/
    if (!crunch_early_merge_tess_fails)
      word->merge_tess_fails();
-    page_res_it.forward ();
+    page_res_it.forward();
  }
 }

-
 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  int i;
  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
@ -646,20 +585,19 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  for (i = 0; i < word_res->reject_map.length(); ++i) {
    if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
      word_res->best_choice->set_unichar_id(unichar_dash, i);
-      if (word_res->reject_map[i].accepted ())
-        word_res->reject_map[i].setrej_unlv_rej ();
+      if (word_res->reject_map[i].accepted())
+        word_res->reject_map[i].setrej_unlv_rej();
    }
    if (word_res->best_choice->unichar_id(i) == unichar_pow) {
      word_res->best_choice->set_unichar_id(unichar_space, i);
-      if (word_res->reject_map[i].accepted ())
-        word_res->reject_map[i].setrej_unlv_rej ();
+      if (word_res->reject_map[i].accepted())
+        word_res->reject_map[i].setrej_unlv_rej();
    }
  }
 }

 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
-  enum STATES
-  {
+  enum STATES {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
@ -690,7 +628,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {

  for (; *str != '\0'; str += *(lengths++)) {
    len++;
-    if (word->uch_set->get_isupper (str, *lengths)) {
+    if (word->uch_set->get_isupper(str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
@ -704,8 +642,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
-          }
-          else {
+          } else {
            last_char = word->uch_set->unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
@ -720,8 +657,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
          upper_string_count = 1;
          break;
      }
-    }
-    else if (word->uch_set->get_islower (str, *lengths)) {
+    } else if (word->uch_set->get_islower(str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
@ -735,8 +671,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
-          }
-          else {
+          } else {
            last_char = word->uch_set->unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
@ -751,8 +686,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
          lower_string_count = 1;
          break;
      }
-    }
-    else if (word->uch_set->get_isdigit (str, *lengths)) {
+    } else if (word->uch_set->get_isdigit(str, *lengths)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
@ -767,8 +701,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
          state = FIRST_NUM;
          break;
      }
-    }
-    else {
+    } else {
      if (*lengths == 1 && *str == ' ')
        tess_rejs++;
      else
@ -802,63 +735,51 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

-  if (crunch_leave_ok_strings && len >= 4 &&
-      2 * (total_alpha_count - isolated_alphas) > len &&
+  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
-         acceptable_word_string(*word->uch_set, str, lengths) !=
-             AC_UNACCEPTABLE) ||
+         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings)
      return G_NEVER_CRUNCH;
  }
-  if (word->reject_map.length() > 1 &&
-      strpbrk(str, " ") == nullptr &&
+  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
-       acceptable_word_string(*word->uch_set, str, lengths) !=
-           AC_UNACCEPTABLE || ok_dict_word))
+       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word))
    return G_OK;

-  ok_chars = len - bad_char_count - isolated_digits -
-    isolated_alphas - tess_rejs;
+  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
-    tprintf("garbage_word: \"%s\"\n",
-            word->best_choice->unichar_string().c_str());
-    tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
-            len,
-            bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
+    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
+    tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n", len, bad_char_count,
+            isolated_digits, isolated_alphas, tess_rejs);
  }
-  if (bad_char_count == 0 &&
-      tess_rejs == 0 &&
+  if (bad_char_count == 0 && tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2))
    return G_OK;

-  if (tess_rejs > ok_chars ||
-      (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
+  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
    return G_TERRIBLE;

  if (len > 4) {
-    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
-        isolated_alphas;
+    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
      return G_DODGY;
    else
      return G_OK;
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
-    if ((len == 4 && dodgy_chars > 2) ||
-        (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
+    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
      return G_DODGY;
    else
      return G_OK;
  }
 }

-
 /*************************************************************************
 * word_deletable()
 *     DELETE WERDS AT ENDS OF ROWS IF
@ -876,9 +797,9 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
 *************************************************************************/

 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
-  int word_len = word->reject_map.length ();
+  int word_len = word->reject_map.length();
  float rating_per_ch;
-  TBOX box;                       //BB of word
+  TBOX box; // BB of word

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
@ -893,7 +814,7 @@ CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    box = word->rebuild_word->bounding_box();
-    if (box.height () < crunch_del_min_ht * kBlnXHeight) {
+    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
      delete_mode = 4;
      return CR_DELETE;
    }
@ -904,40 +825,39 @@ CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
    }
  }

-  if ((failure_count (word) * 1.5) > word_len) {
+  if ((failure_count(word) * 1.5) > word_len) {
    delete_mode = 2;
    return CR_LOOSE_SPACE;
  }

-  if (word->best_choice->certainty () < crunch_del_cert) {
+  if (word->best_choice->certainty() < crunch_del_cert) {
    delete_mode = 7;
    return CR_LOOSE_SPACE;
  }

-  rating_per_ch = word->best_choice->rating () / word_len;
+  rating_per_ch = word->best_choice->rating() / word_len;

  if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
    return CR_LOOSE_SPACE;
  }

-  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
+  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

-  if (box.bottom () >
-  kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
+  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

-  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
+  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

-  if (box.width () < crunch_del_min_width * kBlnXHeight) {
+  if (box.width() < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }
@ -957,17 +877,16 @@ int16_t Tesseract::failure_count(WERD_RES *word) {
  return tess_rejs;
 }

-
-bool Tesseract::noise_outlines(TWERD* word) {
-  TBOX box;                       // BB of outline
+bool Tesseract::noise_outlines(TWERD *word) {
+  TBOX box; // BB of outline
  int16_t outline_count = 0;
  int16_t small_outline_count = 0;
  int16_t max_dimension;
  float small_limit = kBlnXHeight * crunch_small_outlines_size;

  for (int b = 0; b < word->NumBlobs(); ++b) {
-    TBLOB* blob = word->blobs[b];
-    for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
+    TBLOB *blob = word->blobs[b];
+    for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
      outline_count++;
      box = ol->bounding_box();
      if (box.height() > box.width())
@ -981,4 +900,4 @@ bool Tesseract::noise_outlines(TWERD* word) {
  return small_outline_count >= outline_count;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/docqual.h
+++ b/src/ccmain/docqual.h
@ -19,21 +19,19 @@
 #ifndef DOCQUAL_H
 #define DOCQUAL_H

-#include <cstdint>  // for int16_t
+#include <cstdint> // for int16_t
+
+namespace tesseract {

 class PAGE_RES_IT;
 class ROW;
 class WERD_RES;

-enum GARBAGE_LEVEL
-{
-  G_NEVER_CRUNCH,
-  G_OK,
-  G_DODGY,
-  G_TERRIBLE
-};
+enum GARBAGE_LEVEL { G_NEVER_CRUNCH, G_OK, G_DODGY, G_TERRIBLE };

-int16_t word_blob_quality(WERD_RES* word);
+int16_t word_blob_quality(WERD_RES *word);
 void reject_whole_page(PAGE_RES_IT &page_res_it);

+} // namespace tesseract
+
 #endif
--- a/src/ccmain/equationdetect.cpp
+++ b/src/ccmain/equationdetect.cpp
--- a/src/ccmain/equationdetect.h
+++ b/src/ccmain/equationdetect.h
@ -19,11 +19,11 @@
 #ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
 #define TESSERACT_CCMAIN_EQUATIONDETECT_H_

-#include "blobbox.h"             // for BLOBNBOX (ptr only), BlobSpecialText...
-#include "equationdetectbase.h"  // for EquationDetectBase
-#include <tesseract/genericvector.h>       // for GenericVector
-#include "tesseractclass.h"      // for Tesseract
-#include <tesseract/unichar.h>             // for UNICHAR_ID
+#include <tesseract/unichar.h>  // for UNICHAR_ID
+#include "blobbox.h"            // for BLOBNBOX (ptr only), BlobSpecialText...
+#include "equationdetectbase.h" // for EquationDetectBase
+#include "genericvector.h"      // for GenericVector
+#include "tesseractclass.h"     // for Tesseract

 class TBOX;
 class UNICHARSET;
@ -35,54 +35,46 @@ class ColPartition;
 class ColPartitionGrid;
 class ColPartitionSet;

-class EquationDetect : public EquationDetectBase {
- public:
-  EquationDetect(const char* equ_datapath,
-                 const char* equ_language);
+class TESS_API EquationDetect : public EquationDetectBase {
+public:
+  EquationDetect(const char *equ_datapath, const char *equ_language);
  ~EquationDetect() override;

-  enum IndentType {
-    NO_INDENT,
-    LEFT_INDENT,
-    RIGHT_INDENT,
-    BOTH_INDENT,
-    INDENT_TYPE_COUNT
-  };
+  enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT };

  // Reset the lang_tesseract_ pointer. This function should be called before we
  // do any detector work.
-  void SetLangTesseract(Tesseract* lang_tesseract);
+  void SetLangTesseract(Tesseract *lang_tesseract);

  // Iterate over the blobs inside to_block, and set the blobs that we want to
  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
  // returns 0 upon success.
-  int LabelSpecialText(TO_BLOCK* to_block) override;
+  int LabelSpecialText(TO_BLOCK *to_block) override;

  // Find possible equation partitions from part_grid. Should be called
  // after the special_text_type of blobs are set.
  // It returns 0 upon success.
-  int FindEquationParts(ColPartitionGrid* part_grid,
-                        ColPartitionSet** best_columns) override;
+  int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override;

  // Reset the resolution of the processing image. TEST only function.
  void SetResolution(const int resolution);

- protected:
+protected:
  // Identify the special text type for one blob, and update its field. When
  // height_th is set (> 0), we will label the blob as BSTT_NONE if its height
  // is less than height_th.
  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);

  // Estimate the type for one unichar.
-  BlobSpecialTextType EstimateTypeForUnichar(
-      const UNICHARSET& unicharset, const UNICHAR_ID id) const;
+  BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset,
+                                             const UNICHAR_ID id) const;

  // Compute special text type for each blobs in part_grid_.
  void IdentifySpecialText();

  // Identify blobs that we want to skip during special blob type
  // classification.
-  void IdentifyBlobsToSkip(ColPartition* part);
+  void IdentifyBlobsToSkip(ColPartition *part);

  // The ColPartitions in part_grid_ maybe over-segmented, particularly in the
  // block equation regions. So we like to identify these partitions and merge
@ -94,62 +86,56 @@ class EquationDetect : public EquationDetectBase {
  // parts_overlap. Note: this function may update the part_grid_, so if the
  // caller is also running ColPartitionGridSearch, use the RepositionIterator
  // to continue.
-  void SearchByOverlap(ColPartition* seed,
-                       GenericVector<ColPartition*>* parts_overlap);
+  void SearchByOverlap(ColPartition *seed, GenericVector<ColPartition *> *parts_overlap);

  // Insert part back into part_grid_, after it absorbs some other parts.
-  void InsertPartAfterAbsorb(ColPartition* part);
+  void InsertPartAfterAbsorb(ColPartition *part);

  // Identify the ColPartitions in part_grid_, label them as PT_EQUATION, and
  // save them into cp_seeds_.
  void IdentifySeedParts();

  // Check the blobs count for a seed region candidate.
-  bool CheckSeedBlobsCount(ColPartition* part);
+  bool CheckSeedBlobsCount(ColPartition *part);

  // Compute the foreground pixel density for a tbox area.
-  float ComputeForegroundDensity(const TBOX& tbox);
+  float ComputeForegroundDensity(const TBOX &tbox);

  // Check if part from seed2 label: with low math density and left indented. We
  // are using two checks:
  // 1. If its left is aligned with any coordinates in indented_texts_left,
  // which we assume have been sorted.
  // 2. If its foreground density is over foreground_density_th.
-  bool CheckForSeed2(
-      const GenericVector<int>& indented_texts_left,
-      const float foreground_density_th,
-      ColPartition* part);
+  bool CheckForSeed2(const GenericVector<int> &indented_texts_left,
+                     const float foreground_density_th, ColPartition *part);

  // Count the number of values in sorted_vec that is close to val, used to
  // check if a partition is aligned with text partitions.
-  int CountAlignment(
-      const GenericVector<int>& sorted_vec, const int val) const;
+  int CountAlignment(const GenericVector<int> &sorted_vec, const int val) const;

  // Check for a seed candidate using the foreground pixel density. And we
  // return true if the density is below a certain threshold, because characters
  // in equation regions usually are apart with more white spaces.
-  bool CheckSeedFgDensity(const float density_th, ColPartition* part);
+  bool CheckSeedFgDensity(const float density_th, ColPartition *part);

  // A light version of SplitCPHor: instead of really doing the part split, we
  // simply compute the union bounding box of each split part.
-  void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
+  void SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes);

  // Split the part (horizontally), and save the split result into
  // parts_splitted. Note that it is the caller's responsibility to release the
  // memory owned by parts_splitted. On the other hand, the part is unchanged
  // during this process and still owns the blobs, so do NOT call DeleteBoxes
  // when freeing the colpartitions in parts_splitted.
-  void SplitCPHor(ColPartition* part,
-                  GenericVector<ColPartition*>* parts_splitted);
+  void SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted);

  // Check the density for a seed candidate (part) using its math density and
  // italic density, returns true if the check passed.
-  bool CheckSeedDensity(const float math_density_high,
-                        const float math_density_low,
-                        const ColPartition* part) const;
+  bool CheckSeedDensity(const float math_density_high, const float math_density_low,
+                        const ColPartition *part) const;

  // Check if part is indented.
-  IndentType IsIndented(ColPartition* part);
+  IndentType IsIndented(ColPartition *part);

  // Identify inline partitions from cp_seeds_, and re-label them.
  void IdentifyInlineParts();
@ -165,38 +151,32 @@ class EquationDetect : public EquationDetectBase {
  int EstimateTextPartLineSpacing();

  // Identify inline partitions from cp_seeds_ using vertical search.
-  void IdentifyInlinePartsVertical(const bool top_to_bottom,
-                                   const int textPartsLineSpacing);
+  void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing);

  // Check if part is an inline equation zone. This should be called after we
  // identified the seed regions.
-  bool IsInline(const bool search_bottom,
-                const int textPartsLineSpacing,
-                ColPartition* part);
+  bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part);

  // For a given seed partition, we search the part_grid_ and see if there is
  // any partition that can be merged with it. It returns true if the seed has been
  // expanded.
-  bool ExpandSeed(ColPartition* seed);
+  bool ExpandSeed(ColPartition *seed);

  // Starting from the seed position, we search the part_grid_
  // horizontally/vertically, find all partitions that can be
  // merged with seed, remove them from part_grid_, and put them  into
  // parts_to_merge.
-  void ExpandSeedHorizontal(const bool search_left,
-                            ColPartition* seed,
-                            GenericVector<ColPartition*>* parts_to_merge);
-  void ExpandSeedVertical(const bool search_bottom,
-                          ColPartition* seed,
-                          GenericVector<ColPartition*>* parts_to_merge);
+  void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
+                            GenericVector<ColPartition *> *parts_to_merge);
+  void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
+                          GenericVector<ColPartition *> *parts_to_merge);

  // Check if a part_box is the small neighbor of seed_box.
-  bool IsNearSmallNeighbor(const TBOX& seed_box,
-                           const TBOX& part_box) const;
+  bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;

  // Perform the density check for part, which we assume is nearing a seed
  // partition. It returns true if the check passed.
-  bool CheckSeedNeighborDensity(const ColPartition* part) const;
+  bool CheckSeedNeighborDensity(const ColPartition *part) const;

  // After identifying the math blocks, we do one more scan over all text
  // partitions, and check if any of them is the satellite of:
@ -210,56 +190,54 @@ class EquationDetect : public EquationDetectBase {

  // Check if part is the satellite of one/two math blocks. If it is, we return
  // true, and save the blocks into math_blocks.
-  bool IsMathBlockSatellite(
-      ColPartition* part, GenericVector<ColPartition*>* math_blocks);
+  bool IsMathBlockSatellite(ColPartition *part, GenericVector<ColPartition *> *math_blocks);

  // Search the nearest neighbor of part in one vertical direction as defined in
  // search_bottom. It returns the neighbor found that has major x overlap with it,
  // or nullptr when not found.
-  ColPartition* SearchNNVertical(const bool search_bottom,
-                                 const ColPartition* part);
+  ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part);

  // Check if the neighbor with vertical distance of y_gap is a near and math
  // block partition.
  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;

  // Generate the tiff file name for output/debug file.
-  void GetOutputTiffName(const char* name, STRING* image_name) const;
+  void GetOutputTiffName(const char *name, std::string &image_name) const;

  // Debugger function that renders ColPartitions on the input image, where:
  // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
  // will be painted in green, and other parts will be painted in blue.
-  void PaintColParts(const STRING& outfile) const;
+  void PaintColParts(const std::string &outfile) const;

  // Debugger function that renders the blobs in part_grid_ over the input
  // image.
-  void PaintSpecialTexts(const STRING& outfile) const;
+  void PaintSpecialTexts(const std::string &outfile) const;

  // Debugger function that prints the math blob density values for a
  // ColPartition object.
-  void PrintSpecialBlobsDensity(const ColPartition* part) const;
+  void PrintSpecialBlobsDensity(const ColPartition *part) const;

  // The tesseract engine initialized from equation training data.
  Tesseract equ_tesseract_;

  // The tesseract engine used for OCR. This pointer is passed in by the caller,
  // so do NOT destroy it in this class.
-  Tesseract* lang_tesseract_;
+  Tesseract *lang_tesseract_;

  // The ColPartitionGrid that we are processing. This pointer is passed in from
  // the caller, so do NOT destroy it in the class.
-  ColPartitionGrid* part_grid_ = nullptr;
+  ColPartitionGrid *part_grid_ = nullptr;

  // A simple array of pointers to the best assigned column division at
  // each grid y coordinate. This pointer is passed in from the caller, so do
  // NOT destroy it in the class.
-  ColPartitionSet** best_columns_ = nullptr;
+  ColPartitionSet **best_columns_ = nullptr;

  // The super bounding box of all cps in the part_grid_.
-  TBOX* cps_super_bbox_;
+  TBOX *cps_super_bbox_;

  // The seed ColPartition for equation region.
-  GenericVector<ColPartition*> cp_seeds_;
+  GenericVector<ColPartition *> cp_seeds_;

  // The resolution (dpi) of the processing image.
  int resolution_;
@ -268,6 +246,6 @@ class EquationDetect : public EquationDetectBase {
  int page_count_;
 };

-}  // namespace tesseract
+} // namespace tesseract

-#endif  // TESSERACT_CCMAIN_EQUATIONDETECT_H_
+#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_
--- a/src/ccmain/fixspace.cpp
+++ b/src/ccmain/fixspace.cpp
@ -19,31 +19,33 @@
 **********************************************************************/

 #include "fixspace.h"
-#include <cstdint>             // for INT16_MAX, int16_t, int32_t
-#include "blobs.h"             // for TWERD, TBLOB, TESSLINE
-#include "boxword.h"           // for BoxWord
-#include "errcode.h"           // for ASSERT_HOST
-#include "normalis.h"          // for kBlnXHeight, kBlnBaselineOffset
-#include <tesseract/ocrclass.h>          // for ETEXT_DESC
-#include "pageres.h"           // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
-#include "params.h"            // for IntParam, StringParam, BoolParam, Doub...
-#include "ratngs.h"            // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
-#include "rect.h"              // for TBOX
-#include "stepblob.h"          // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
-#include <tesseract/strngs.h>            // for STRING
-#include "tesseractclass.h"    // for Tesseract, TesseractStats, WordData
-#include "tessvars.h"          // for debug_fp
-#include "tprintf.h"           // for tprintf
-#include <tesseract/unichar.h>           // for UNICHAR_ID
-#include "unicharset.h"        // for UNICHARSET
-#include "werd.h"              // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
+
+#include "blobs.h"          // for TWERD, TBLOB, TESSLINE
+#include "boxword.h"        // for BoxWord
+#include "errcode.h"        // for ASSERT_HOST
+#include "normalis.h"       // for kBlnXHeight, kBlnBaselineOffset
+#include "pageres.h"        // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
+#include "params.h"         // for IntParam, StringParam, BoolParam, Doub...
+#include "ratngs.h"         // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
+#include "rect.h"           // for TBOX
+#include "stepblob.h"       // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
+#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
+#include "tessvars.h"       // for debug_fp
+#include "tprintf.h"        // for tprintf
+#include "unicharset.h"     // for UNICHARSET
+#include "werd.h"           // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
+
+#include <tesseract/ocrclass.h> // for ETEXT_DESC
+#include <tesseract/unichar.h>  // for UNICHAR_ID
+
+#include <cstdint> // for INT16_MAX, int16_t, int32_t
+
+namespace tesseract {

 class BLOCK;
 class ROW;

-#define PERFECT_WERDS   999
-
-namespace tesseract {
+#define PERFECT_WERDS 999

 /**********************************************************************
 *  c_blob_comparator()
@ -52,14 +54,14 @@ namespace tesseract {
 *  order of left edge.
 **********************************************************************/

-static int c_blob_comparator(              // sort blobs
-                      const void *blob1p,  // ptr to ptr to blob1
-                      const void *blob2p   // ptr to ptr to blob2
-                     ) {
-  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p);
-  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p);
+static int c_blob_comparator( // sort blobs
+    const void *blob1p,       // ptr to ptr to blob1
+    const void *blob2p        // ptr to ptr to blob2
+) {
+  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);
+  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);

-  return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
+  return blob1->bounding_box().left() - blob2->bounding_box().left();
 }

 /**
@ -72,9 +74,7 @@ static int c_blob_comparator(              // sort blobs
 * @param word_count count of words in doc
 * @param[out] page_res
 */
-void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
-                                 int32_t word_count,
-                                 PAGE_RES *page_res) {
+void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
@ -82,16 +82,14 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  int16_t new_length;
-  bool prevent_null_wd_fixsp;   // DON'T process blobless wds
-  int32_t word_index;              // current word
+  bool prevent_null_wd_fixsp; // DON'T process blobless wds
+  int32_t word_index;         // current word

  block_res_it.set_to_list(&page_res->block_res_list);
  word_index = 0;
-  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
-       block_res_it.forward()) {
+  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
-    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
-         row_res_it.forward()) {
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
      while (!word_res_it_from.at_last()) {
        word_res = word_res_it_from.data();
@ -99,8 +97,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
               !(word_res->combination ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
-          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
-                         block_res_it.data()->block);
+          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
          word_res = word_res_it_from.forward();
          word_index++;
          if (monitor != nullptr) {
@ -109,14 +106,13 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != nullptr &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
-            return;
+              return;
          }
        }

        if (!word_res_it_from.at_last()) {
          word_res_it_to = word_res_it_from;
-          prevent_null_wd_fixsp =
-            word_res->word->cblob_list()->empty();
+          prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          word_res_it_to.forward();
@ -127,9 +123,9 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != nullptr &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
-            return;
+              return;
          }
-          while (!word_res_it_to.at_last () &&
+          while (!word_res_it_to.at_last() &&
                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
            if (check_debug_pt(word_res, 60))
@ -145,39 +141,32 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
          if (prevent_null_wd_fixsp) {
            word_res_it_from = word_res_it_to;
          } else {
-            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
-                                                &word_res_it_to);
-            fix_fuzzy_space_list(fuzzy_space_words,
-                                 row_res_it.data()->row,
+            fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
+            fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
                                 block_res_it.data()->block);
            new_length = fuzzy_space_words.length();
            word_res_it_from.add_list_before(&fuzzy_space_words);
-            for (;
-                 !word_res_it_from.at_last() && new_length > 0;
-                 new_length--) {
+            for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
              word_res_it_from.forward();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value(0);
        }
-        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
-                       block_res_it.data()->block);
+        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
        // Last word in row
      }
    }
  }
 }

-void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
-                                     ROW *row,
-                                     BLOCK* block) {
+void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  int16_t best_score;
  WERD_RES_LIST current_perm;
  int16_t current_score;
  bool improved = false;

-  best_score = eval_word_spacing(best_perm);  // default score
+  best_score = eval_word_spacing(best_perm); // default score
  dump_words(best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
@ -199,8 +188,6 @@ void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
  dump_words(best_perm, best_score, 3, improved);
 }

-}  // namespace tesseract
-
 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT src_it(&src_list);
  WERD_RES_IT new_it(&new_list);
@ -218,10 +205,7 @@ void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  }
 }

-
-namespace tesseract {
-void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
-                                    BLOCK* block) {
+void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;
  // Since we are not using PAGE_RES to iterate over words, we need to update
@ -253,10 +237,10 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
 * The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character the other side of the space.
 *
- * Conversely, any character next to a "1" within a word is counted as a positive
- * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
- * the "1" joined).  "56163" would score 7 - all chars in a numeric word + 2
- * sides of a "1" joined.
+ * Conversely, any character next to a "1" within a word is counted as a
+ * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
+ * side of the "1" joined).  "56163" would score 7 - all chars in a numeric word
+ * + 2 sides of a "1" joined.
 *
 * The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation.  Thus "PS7a71 3/7a" scores 1 (neither word is contextually
@ -268,24 +252,19 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
  int16_t total_score = 0;
  int16_t word_count = 0;
  int16_t done_word_count = 0;
-  int16_t word_len;
  int16_t i;
  int16_t offset;
-  WERD_RES *word;                 // current word
  int16_t prev_word_score = 0;
  bool prev_word_done = false;
-  bool prev_char_1 = false;      // prev ch a "1/I/l"?
-  bool prev_char_digit = false;  // prev ch 2..9 or 0
-  bool current_char_1 = false;
-  bool current_word_ok_so_far;
-  STRING punct_chars = "!\"`',.:;";
+  bool prev_char_1 = false;     // prev ch a "1/I/l"?
+  bool prev_char_digit = false; // prev ch 2..9 or 0
+  const char *punct_chars = "!\"`',.:;";
  bool prev_char_punct = false;
-  bool current_char_punct = false;
-  bool word_done = false;

  do {
-    word = word_res_it.data();
-    word_done = fixspace_thinks_word_done(word);
+    // current word
+    WERD_RES *word = word_res_it.data();
+    bool word_done = fixspace_thinks_word_done(word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
@ -297,19 +276,18 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
      prev_word_done = false;
    } else {
      /*
-        Can we add the prev word score and potentially count this word?
-        Yes IF it didn't end in a 1 when the first char of this word is a digit
-          AND it didn't end in a digit when the first char of this word is a 1
-      */
-      word_len = word->reject_map.length();
-      current_word_ok_so_far = false;
+  Can we add the prev word score and potentially count this word?
+  Yes IF it didn't end in a 1 when the first char of this word is a digit
+    AND it didn't end in a digit when the first char of this word is a 1
+*/
+      auto word_len = word->reject_map.length();
+      bool current_word_ok_so_far = false;
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
-            (prev_char_digit && (
-                (word_done &&
-                 word->best_choice->unichar_lengths().c_str()[0] == 1 &&
-                 word->best_choice->unichar_string()[0] == '1') ||
-                (!word_done && STRING(conflict_set_I_l_1).contains(
-                      word->best_choice->unichar_string()[0])))))) {
+            (prev_char_digit &&
+             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
+               word->best_choice->unichar_string()[0] == '1') ||
+              (!word_done &&
+               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
@ -325,33 +303,33 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
      }

      /* Add 1 to total score for every joined 1 regardless of context and
-         rejtn */
+   rejtn */
      for (i = 0, prev_char_1 = false; i < word_len; i++) {
-        current_char_1 = word->best_choice->unichar_string()[i] == '1';
+        bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
        if (prev_char_1 || (current_char_1 && (i > 0)))
          total_score++;
        prev_char_1 = current_char_1;
      }

      /* Add 1 to total score for every joined punctuation regardless of context
-        and rejtn */
+  and rejtn */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
             offset += word->best_choice->unichar_lengths()[i++]) {
-          current_char_punct =
-            punct_chars.contains(word->best_choice->unichar_string()[offset]);
+          bool current_char_punct =
+              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
          if (prev_char_punct || (current_char_punct && i > 0))
            total_score++;
          prev_char_punct = current_char_punct;
        }
      }
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
-      for (i = 0, offset = 0; i < word_len - 1;
-           offset += word->best_choice->unichar_lengths()[i++]);
+      for (i = 0, offset = 0; i < word_len - 1; offset += word->best_choice->unichar_lengths()[i++])
+        ;
      prev_char_1 =
-          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
-           || (!word_done && STRING(conflict_set_I_l_1).contains(
-                   word->best_choice->unichar_string()[offset])));
+          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
+           (!word_done &&
+            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
    }
    /* Find next word */
    do {
@ -371,20 +349,15 @@ bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  int i;
  int offset;

-  for (i = 0, offset = 0; i < char_position;
-       offset += word->best_choice->unichar_lengths()[i++]);
+  for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++])
+    ;
  return (
-      word->uch_set->get_isdigit(
-          word->best_choice->unichar_string().c_str() + offset,
-          word->best_choice->unichar_lengths()[i]) ||
+      word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
+                                 word->best_choice->unichar_lengths()[i]) ||
      (word->best_choice->permuter() == NUMBER_PERM &&
-       STRING(numeric_punctuation).contains(
-           word->best_choice->unichar_string().c_str()[offset])));
+       numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
 }

-}  // namespace tesseract
-
-
 /**
 * @name transform_to_next_perm()
 * Examines the current word list to find the smallest word gap size. Then walks
@ -421,11 +394,10 @@ void transform_to_next_perm(WERD_RES_LIST &words) {
    }
  }
  if (min_gap < INT16_MAX) {
-    prev_right = -INT16_MAX;        // back to start
+    prev_right = -INT16_MAX; // back to start
    word_it.set_to_list(&words);
    // Note: we can't use cycle_pt due to inserted combos at start of list.
-    for (; (prev_right == -INT16_MAX) || !word_it.at_first();
-         word_it.forward()) {
+    for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
      word = word_it.data();
      if (!word->part_of_combo) {
        box = word->word->bounding_box();
@ -461,30 +433,26 @@ void transform_to_next_perm(WERD_RES_LIST &words) {
            combo->done = false;
            combo->ClearResults();
          } else {
-            prev_word_it = word_it;  // catch up
+            prev_word_it = word_it; // catch up
          }
        }
        prev_right = box.right();
      }
    }
  } else {
-    words.clear();  // signal termination
+    words.clear(); // signal termination
  }
 }

-namespace tesseract {
-void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
-                           int16_t mode, bool improved) {
+void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
  WERD_RES_IT word_res_it(&perm);

  if (debug_fix_space_level > 0) {
    if (mode == 1) {
      stats_.dump_words_str = "";
-      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
-           word_res_it.forward()) {
+      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
-          stats_.dump_words_str +=
-              word_res_it.data()->best_choice->unichar_string();
+          stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
          stats_.dump_words_str += ' ';
        }
      }
@ -503,22 +471,18 @@ void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
          break;
      }

-      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
-           word_res_it.forward()) {
+      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
-          tprintf("%s/%1d ",
-                  word_res_it.data()->best_choice->unichar_string().c_str(),
+          tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
                  static_cast<int>(word_res_it.data()->best_choice->permuter()));
        }
      }
      tprintf("\"\n");
    } else if (improved) {
      tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
-      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
-           word_res_it.forward()) {
+      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
-          tprintf("%s/%1d ",
-                  word_res_it.data()->best_choice->unichar_string().c_str(),
+          tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
                  static_cast<int>(word_res_it.data()->best_choice->permuter()));
        }
      }
@ -532,13 +496,12 @@ bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
    return true;

  /*
-    Use all the standard pass 2 conditions for mode 5 in set_done() in
-    reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
-    CARE WHETHER WE HAVE of/at on/an etc.
-  */
+  Use all the standard pass 2 conditions for mode 5 in set_done() in
+  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
+  CARE WHETHER WE HAVE of/at on/an etc.
+*/
  if (fixsp_done_mode > 0 &&
-      (word->tess_accepted ||
-       (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
+      (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
       fixsp_done_mode == 3) &&
      (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
@ -551,7 +514,6 @@ bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  }
 }

-
 /**
 * @name fix_sp_fp_word()
 * Test the current word to see if it can be split by deleting noise blobs. If
@ -559,8 +521,7 @@ bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
 * Return with the iterator pointing to the same place if the word is unchanged,
 * or the last of the replacement words.
 */
-void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
-                               BLOCK* block) {
+void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
  WERD_RES *word_res;
  WERD_RES_LIST sub_word_list;
  WERD_RES_IT sub_word_list_it(&sub_word_list);
@ -569,9 +530,7 @@ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
  float junk;

  word_res = word_res_it.data();
-  if (word_res->word->flag(W_REP_CHAR) ||
-      word_res->combination ||
-      word_res->part_of_combo ||
+  if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
      !word_res->word->flag(W_DONT_CHOP))
    return;

@ -580,8 +539,7 @@ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
    return;

  if (debug_fix_space_level > 1) {
-    tprintf("FP fixspace working on \"%s\"\n",
-            word_res->best_choice->unichar_string().c_str());
+    tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
  }
  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
  sub_word_list_it.add_after_stay_put(word_res_it.extract());
@ -593,8 +551,7 @@ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
  }
 }

-void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
-                                     BLOCK* block) {
+void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  int16_t best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
@ -603,16 +560,16 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
  int16_t current_score;
  bool improved = false;

-  best_score = fp_eval_word_spacing(best_perm);  // default score
+  best_score = fp_eval_word_spacing(best_perm); // default score

  dump_words(best_perm, best_score, 1, improved);

  old_word_res = best_perm_it.data();
  // Even deep_copy doesn't copy the underlying WERD unless its combination
  // flag is true!.
-  old_word_res->combination = true;   // Kludge to force deep copy
+  old_word_res->combination = true; // Kludge to force deep copy
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
-  old_word_res->combination = false;  // Undo kludge
+  old_word_res->combination = false; // Undo kludge

  break_noisiest_blob_word(current_perm);

@ -633,7 +590,6 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
  dump_words(best_perm, best_score, 3, improved);
 }

-
 /**
 * break_noisiest_blob_word()
 * Find the word with the blob which looks like the worst noise.
@ -643,9 +599,9 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
-  int worst_blob_index = -1;     // Noisiest blob of noisiest wd
-  int blob_index;                // of wds noisiest blob
-  float noise_score;             // of wds noisiest blob
+  int worst_blob_index = -1; // Noisiest blob of noisiest wd
+  int blob_index;            // of wds noisiest blob
+  float noise_score;         // of wds noisiest blob
  WERD_RES *word_res;
  C_BLOB_IT blob_it;
  C_BLOB_IT rej_cblob_it;
@ -665,7 +621,7 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
    }
  }
  if (worst_blob_index < 0) {
-    words.clear();          // signal termination
+    words.clear(); // signal termination
    return;
  }

@ -681,59 +637,57 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
    new_blob_it.add_after_then_move(blob_it.extract());
  }
  start_of_noise_blob = blob_it.data()->bounding_box().left();
-  delete blob_it.extract();     // throw out noise blob
+  delete blob_it.extract(); // throw out noise blob

  new_word = new WERD(&new_blob_list, word_res->word);
  new_word->set_flag(W_EOL, false);
  word_res->word->set_flag(W_BOL, false);
-  word_res->word->set_blanks(1);  // After break
+  word_res->word->set_blanks(1); // After break

  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
-  for (;
-       (!rej_cblob_it.empty() &&
-        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
+  for (; (!rej_cblob_it.empty() &&
+          (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
       rej_cblob_it.forward()) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
  }

-  auto* new_word_res = new WERD_RES(new_word);
+  auto *new_word_res = new WERD_RES(new_word);
  new_word_res->combination = true;
  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
 }

-int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
-                                  float *worst_noise_score) {
+int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  float noise_score[512];
  int i;
-  int min_noise_blob;            // 1st contender
-  int max_noise_blob;            // last contender
+  int min_noise_blob; // 1st contender
+  int max_noise_blob; // last contender
  int non_noise_count;
-  int worst_noise_blob;          // Worst blob
+  int worst_noise_blob; // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == nullptr)
-    return -1;  // Can't handle cube words.
+    return -1; // Can't handle cube words.

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
-    return -1;                   // too short to split
+    return -1; // too short to split

-  /* Get the noise scores for all blobs */
+    /* Get the noise scores for all blobs */

-  #ifndef SECURE_NAMES
+#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().c_str());
-  #endif
+#endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
-    TBLOB* blob = word_res->rebuild_word->blobs[i];
+    TBLOB *blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
@ -759,8 +713,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
  min_noise_blob = i;

  non_noise_count = 0;
-  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
-       i--) {
+  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
@ -785,12 +738,12 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
 }

 float Tesseract::blob_noise_score(TBLOB *blob) {
-  TBOX box;                       // BB of outline
+  TBOX box; // BB of outline
  int16_t outline_count = 0;
  int16_t max_dimension;
  int16_t largest_outline_dimension = 0;

-  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
+  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
    outline_count++;
    box = ol->bounding_box();
    if (box.height() > box.width()) {
@ -809,15 +762,13 @@ float Tesseract::blob_noise_score(TBLOB *blob) {
  }

  box = blob->bounding_box();
-  if (box.bottom() > kBlnBaselineOffset * 4 ||
-      box.top() < kBlnBaselineOffset / 2) {
+  if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
    // Lax blob is if high or low
    largest_outline_dimension /= 2;
  }

  return largest_outline_dimension;
 }
-}  // namespace tesseract

 void fixspace_dbg(WERD_RES *word) {
  TBOX box = word->word->bounding_box();
@ -826,10 +777,8 @@ void fixspace_dbg(WERD_RES *word) {

  box.print();
  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
-  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
-          word->word->cblob_list()->length(),
-          word->rebuild_word->NumBlobs(),
-          word->box_word->length());
+  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
+          word->rebuild_word->NumBlobs(), word->box_word->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");
  if (show_map_detail) {
@ -844,7 +793,6 @@ void fixspace_dbg(WERD_RES *word) {
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
 }

-
 /**
 * fp_eval_word_spacing()
 * Evaluation function for fixed pitch word lists.
@ -853,7 +801,6 @@ void fixspace_dbg(WERD_RES *word) {
 * acceptable words or in dict words and are not rejected.
 * Penalise any potential noise chars
 */
-namespace tesseract {
 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
@ -864,20 +811,16 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (word->rebuild_word == nullptr)
-      continue;  // Can't handle cube words.
-    if (word->done ||
-        word->tess_accepted ||
-        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+      continue; // Can't handle cube words.
+    if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
-        word->best_choice->permuter() == USER_DAWG_PERM ||
-        safe_dict_word(word) > 0) {
+        word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
      int num_blobs = word->rebuild_word->NumBlobs();
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
-        TBLOB* blob = word->rebuild_word->blobs[i];
-        if (word->best_choice->unichar_id(i) == space ||
-            blob_noise_score(blob) < small_limit) {
-          score -= 1;  // penalise possibly erroneous non-space
+        TBLOB *blob = word->rebuild_word->blobs[i];
+        if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
+          score -= 1; // penalise possibly erroneous non-space
        } else if (word->reject_map[i].accepted()) {
          score++;
        }
@ -889,4 +832,4 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  return score;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/fixspace.h
+++ b/src/ccmain/fixspace.h
@ -22,6 +22,8 @@
 #ifndef FIXSPACE_H
 #define FIXSPACE_H

+namespace tesseract {
+
 class WERD_RES;
 class WERD_RES_LIST;

@ -29,4 +31,6 @@ void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
 void transform_to_next_perm(WERD_RES_LIST &words);
 void fixspace_dbg(WERD_RES *word);

+} // namespace tesseract
+
 #endif
--- a/src/ccmain/fixxht.cpp
+++ b/src/ccmain/fixxht.cpp
@ -17,13 +17,14 @@
 *
 **********************************************************************/

-#include <algorithm>
-#include <cstring>
-#include <cctype>
-#include "params.h"
 #include "float2int.h"
+#include "params.h"
 #include "tesseractclass.h"

+#include <algorithm>
+#include <cctype>
+#include <cstring>
+
 namespace tesseract {

 // Fixxht overview.
@ -71,25 +72,23 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
  int bad_blobs = 0;
  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
-    TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+    TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
      int top = blob->bounding_box().top();
      if (top >= INT_FEAT_RANGE)
        top = INT_FEAT_RANGE - 1;
      int min_bottom, max_bottom, min_top, max_top;
-      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
-                                &min_top, &max_top);
+      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
      if (max_top - min_top > kMaxCharTopRange)
        continue;
-      bool bad =  top < min_top - x_ht_acceptance_tolerance ||
-                  top > max_top + x_ht_acceptance_tolerance;
+      bool bad =
+          top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;
      if (bad)
        ++bad_blobs;
      if (debug_x_ht_level >= 1) {
        tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
-                unicharset.id_to_unichar(class_id),
-                bad ? "Misfit" : "OK", top, min_top, max_top,
+                unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
                static_cast<int>(x_ht_acceptance_tolerance));
      }
    }
@ -99,8 +98,7 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {

 // Returns a new x-height maximally compatible with the result in word_res.
 // See comment above for overall algorithm.
-float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
-                                          float* baseline_shift) {
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
  STATS top_stats(0, UINT8_MAX);
  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
  int bottom_shift = 0;
@ -109,43 +107,36 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
    top_stats.clear();
    shift_stats.clear();
    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
-      TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+      TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
-      if (unicharset.get_isalpha(class_id) ||
-          unicharset.get_isdigit(class_id)) {
+      if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
        int top = blob->bounding_box().top() + bottom_shift;
        // Clip the top to the limit of normalized feature space.
        if (top >= INT_FEAT_RANGE)
          top = INT_FEAT_RANGE - 1;
        int bottom = blob->bounding_box().bottom() + bottom_shift;
        int min_bottom, max_bottom, min_top, max_top;
-        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
-                                  &min_top, &max_top);
+        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
        // Chars with a wild top range would mess up the result so ignore them.
        if (max_top - min_top > kMaxCharTopRange)
          continue;
        int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
-                            top - (max_top + x_ht_acceptance_tolerance));
+                                   top - (max_top + x_ht_acceptance_tolerance));
        int height = top - kBlnBaselineOffset;
        if (debug_x_ht_level >= 2) {
          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
-                  unicharset.id_to_unichar(class_id),
-                  height, min_bottom, max_bottom, min_top, max_top,
-                  bottom, top);
+                  unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
+                  max_top, bottom, top);
        }
        // Use only chars that fit in the expected bottom range, and where
        // the range of tops is sensibly near the xheight.
        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
-            bottom - x_ht_acceptance_tolerance <= max_bottom &&
-            min_top > kBlnBaselineOffset &&
-            max_top - kBlnBaselineOffset >= kBlnXHeight &&
-            misfit_dist > 0) {
+            bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
+            max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
          // Compute the x-height position using proportionality between the
          // actual height and expected height.
-          int min_xht = DivRounded(height * kBlnXHeight,
-                                   max_top - kBlnBaselineOffset);
-          int max_xht = DivRounded(height * kBlnXHeight,
-                                   min_top - kBlnBaselineOffset);
+          int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
+          int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
          if (debug_x_ht_level >= 2) {
            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
          }
@ -188,8 +179,7 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
        tprintf("Applying bottom shift=%d\n", bottom_shift);
      }
    }
-  } while (bottom_shift != 0 &&
-           top_stats.get_total() < shift_stats.get_total());
+  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
  // Baseline shift is opposite sign to the bottom shift.
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
  if (debug_x_ht_level >= 2) {
@ -202,8 +192,8 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 2) {
    tprintf("Median xht=%f\n", new_xht);
-    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
-            new_xht, new_xht / word_res->denorm.y_scale());
+    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
+            new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
@ -212,4 +202,4 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
 }

-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/linerec.cpp
+++ b/src/ccmain/linerec.cpp
@ -17,14 +17,12 @@

 #include "tesseractclass.h"

-#include "allheaders.h"
+#include <allheaders.h>
 #include "boxread.h"
-#include "imagedata.h"
-#ifndef ANDROID_BUILD
+#include "imagedata.h" // for ImageData
 #include "lstmrecognizer.h"
-#include "recodebeam.h"
-#endif
 #include "pageres.h"
+#include "recodebeam.h"
 #include "tprintf.h"

 #include <algorithm>
@ -40,10 +38,9 @@ const float kWorstDictCertainty = -25.0f;
 // Breaks the page into lines, according to the boxes, and writes them to a
 // serialized DocumentData based on output_basename.
 // Return true if successful, false if an error occurred.
-bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
-                                    const STRING& output_basename,
+bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
                                    BLOCK_LIST *block_list) {
-  STRING lstmf_name = output_basename + ".lstmf";
+  std::string lstmf_name = output_basename + ".lstmf";
  DocumentData images(lstmf_name);
  if (applybox_page > 0) {
    // Load existing document for the previous pages.
@ -52,18 +49,17 @@ bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
      return false;
    }
  }
-  GenericVector<TBOX> boxes;
-  GenericVector<STRING> texts;
+  std::vector<TBOX> boxes;
+  std::vector<std::string> texts;
  // Get the boxes for this page, if there are any.
-  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
-                    nullptr) ||
+  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
      boxes.empty()) {
-    tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
+    tprintf("Failed to read boxes from %s\n", input_imagename);
    return false;
  }
  TrainFromBoxes(boxes, texts, block_list, &images);
  if (images.PagesSize() == 0) {
-    tprintf("Failed to read pages from %s\n", input_imagename.c_str());
+    tprintf("Failed to read pages from %s\n", input_imagename);
    return false;
  }
  images.Shuffle();
@ -77,33 +73,31 @@ bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
 // Generates training data for training a line recognizer, eg LSTM.
 // Breaks the boxes into lines, normalizes them, converts to ImageData and
 // appends them to the given training_data.
-void Tesseract::TrainFromBoxes(const GenericVector<TBOX>& boxes,
-                               const GenericVector<STRING>& texts,
-                               BLOCK_LIST *block_list,
-                               DocumentData* training_data) {
+void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
+                               BLOCK_LIST *block_list, DocumentData *training_data) {
  int box_count = boxes.size();
  // Process all the text lines in this page, as defined by the boxes.
  int end_box = 0;
  // Don't let \t, which marks newlines in the box file, get into the line
  // content, as that makes the line unusable in training.
-  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+  while (end_box < texts.size() && texts[end_box] == "\t")
+    ++end_box;
  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
    // Find the textline of boxes starting at start and their bounding box.
    TBOX line_box = boxes[start_box];
-    STRING line_str = texts[start_box];
-    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
-         ++end_box) {
+    std::string line_str = texts[start_box];
+    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
      line_box += boxes[end_box];
      line_str += texts[end_box];
    }
    // Find the most overlapping block.
-    BLOCK* best_block = nullptr;
+    BLOCK *best_block = nullptr;
    int best_overlap = 0;
    BLOCK_IT b_it(block_list);
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
-      BLOCK* block = b_it.data();
+      BLOCK *block = b_it.data();
      if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
-        continue;  // Not a text block.
+        continue; // Not a text block.
      TBOX block_box = block->pdblk.bounding_box();
      block_box.rotate(block->re_rotation());
      if (block_box.major_overlap(line_box)) {
@ -114,39 +108,37 @@ void Tesseract::TrainFromBoxes(const GenericVector<TBOX>& boxes,
        }
      }
    }
-    ImageData* imagedata = nullptr;
+    ImageData *imagedata = nullptr;
    if (best_block == nullptr) {
      tprintf("No block overlapping textline: %s\n", line_str.c_str());
    } else {
-      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
-                              *best_block);
+      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
    }
    if (imagedata != nullptr)
      training_data->AddPageToDocument(imagedata);
    // Don't let \t, which marks newlines in the box file, get into the line
    // content, as that makes the line unusable in training.
-    while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+    while (end_box < texts.size() && texts[end_box] == "\t")
+      ++end_box;
  }
 }

 // Returns an Imagedata containing the image of the given box,
 // and ground truth boxes/truth text if available in the input.
 // The image is not normalized in any way.
-ImageData* Tesseract::GetLineData(const TBOX& line_box,
-                                  const GenericVector<TBOX>& boxes,
-                                  const GenericVector<STRING>& texts,
-                                  int start_box, int end_box,
-                                  const BLOCK& block) {
+ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
+                                  const std::vector<std::string> &texts, int start_box, int end_box,
+                                  const BLOCK &block) {
  TBOX revised_box;
-  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
-                                       &revised_box);
-  if (image_data == nullptr) return nullptr;
+  ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
+  if (image_data == nullptr)
+    return nullptr;
  image_data->set_page_number(applybox_page);
  // Copy the boxes and shift them so they are relative to the image.
  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
  ICOORD shift = -revised_box.botleft();
-  GenericVector<TBOX> line_boxes;
-  GenericVector<STRING> line_texts;
+  std::vector<TBOX> line_boxes;
+  std::vector<std::string> line_texts;
  for (int b = start_box; b < end_box; ++b) {
    TBOX box = boxes[b];
    box.rotate(block_rotation);
@ -154,8 +146,8 @@ ImageData* Tesseract::GetLineData(const TBOX& line_box,
    line_boxes.push_back(box);
    line_texts.push_back(texts[b]);
  }
-  GenericVector<int> page_numbers;
-  page_numbers.init_to_size(line_boxes.size(), applybox_page);
+  std::vector<int> page_numbers;
+  page_numbers.resize(line_boxes.size(), applybox_page);
  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
  return image_data;
 }
@ -166,8 +158,8 @@ ImageData* Tesseract::GetLineData(const TBOX& line_box,
 // is set in the returned ImageData if the text was originally vertical, which
 // can be used to invoke a different CJK recognition engine. The revised_box
 // is also returned to enable calculation of output bounding boxes.
-ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
-                                   int padding, TBOX* revised_box) const {
+ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
+                                   TBOX *revised_box) const {
  TBOX wbox = box;
  wbox.pad(padding, padding);
  *revised_box = wbox;
@ -186,27 +178,29 @@ ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
    revised_box->rotate(block.re_rotation());
  // Now revised_box always refers to the image.
  // BestPix is never colormapped, but may be of any depth.
-  Pix* pix = BestPix();
+  Pix *pix = BestPix();
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  TBOX image_box(0, 0, width, height);
  // Clip to image bounds;
  *revised_box &= image_box;
-  if (revised_box->null_box()) return nullptr;
-  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
-                            revised_box->width(), revised_box->height());
-  Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
-  if (box_pix == nullptr) return nullptr;
+  if (revised_box->null_box())
+    return nullptr;
+  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
+                            revised_box->height());
+  Pix *box_pix = pixClipRectangle(pix, clip_box, nullptr);
  boxDestroy(&clip_box);
+  if (box_pix == nullptr)
+    return nullptr;
  if (num_rotations > 0) {
-    Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
+    Pix *rot_pix = pixRotateOrth(box_pix, num_rotations);
    pixDestroy(&box_pix);
    box_pix = rot_pix;
  }
  // Convert sub-8-bit images to 8 bit.
  int depth = pixGetDepth(box_pix);
  if (depth < 8) {
-    Pix* grey;
+    Pix *grey;
    grey = pixConvertTo8(box_pix, false);
    pixDestroy(&box_pix);
    box_pix = grey;
@ -222,15 +216,13 @@ ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
  return new ImageData(vertical_text, box_pix);
 }

-#ifndef ANDROID_BUILD
 // Recognizes a word or group of words, converting to WERD_RES in *words.
 // Analogous to classify_word_pass1, but can handle a group of words as well.
-void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
-                                  PointerVector<WERD_RES>* words) {
+void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
+                                  PointerVector<WERD_RES> *words) {
  TBOX word_box = word->word->bounding_box();
  // Get the word image - no frills.
-  if (tessedit_pageseg_mode == PSM_SINGLE_WORD ||
-      tessedit_pageseg_mode == PSM_RAW_LINE) {
+  if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
    // In single word mode, use the whole image without any other row/word
    // interpretation.
    word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
@ -241,14 +233,14 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
    if (baseline + row->x_height() + row->ascenders() > word_box.top())
      word_box.set_top(baseline + row->x_height() + row->ascenders());
  }
-  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
-  if (im_data == nullptr) return;
+  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
+  if (im_data == nullptr)
+    return;

  bool do_invert = tessedit_do_invert;
  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
-                                  kWorstDictCertainty / kCertaintyScale,
-                                  word_box, words, lstm_choice_mode,
-                                  lstm_choice_iterations);
+                                  kWorstDictCertainty / kCertaintyScale, word_box, words,
+                                  lstm_choice_mode, lstm_choice_iterations);
  delete im_data;
  SearchWords(words);
 }
@ -256,24 +248,24 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
 // Apply segmentation search to the given set of words, within the constraints
 // of the existing ratings matrix. If there is already a best_choice on a word
 // leaves it untouched and just sets the done/accepted etc flags.
-void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
+void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
  // Run the segmentation search on the network outputs and make a BoxWord
  // for each of the output words.
  // If we drop a word as junk, then there is always a space in front of the
  // next.
-  const Dict* stopper_dict = lstm_recognizer_->GetDict();
-  if (stopper_dict == nullptr) stopper_dict = &getDict();
+  const Dict *stopper_dict = lstm_recognizer_->GetDict();
+  if (stopper_dict == nullptr)
+    stopper_dict = &getDict();
  bool any_nonspace_delimited = false;
  for (int w = 0; w < words->size(); ++w) {
-    WERD_RES* word = (*words)[w];
-    if (word->best_choice != nullptr &&
-        word->best_choice->ContainsAnyNonSpaceDelimited()) {
+    WERD_RES *word = (*words)[w];
+    if (word->best_choice != nullptr && word->best_choice->ContainsAnyNonSpaceDelimited()) {
      any_nonspace_delimited = true;
      break;
    }
  }
  for (int w = 0; w < words->size(); ++w) {
-    WERD_RES* word = (*words)[w];
+    WERD_RES *word = (*words)[w];
    if (word->best_choice == nullptr) {
      // It is a dud.
      word->SetupFake(lstm_recognizer_->GetUnicharset());
@ -289,14 +281,12 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
      word->tess_would_adapt = false;
      word->done = true;
      word->tesseract = this;
-      float word_certainty = std::min(word->space_certainty,
-                                 word->best_choice->certainty());
+      float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
      word_certainty *= kCertaintyScale;
      if (getDict().stopper_debug_level >= 1) {
        tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
                word->best_choice->certainty(), word->space_certainty,
-                std::min(word->space_certainty, word->best_choice->certainty()) *
-                    kCertaintyScale,
+                std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
                word_certainty);
        word->best_choice->print();
      }
@ -306,6 +296,5 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
    }
  }
 }
-#endif  // ANDROID_BUILD

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/ltrresultiterator.cpp
+++ b/src/ccmain/ltrresultiterator.cpp
@ -19,21 +19,20 @@

 #include <tesseract/ltrresultiterator.h>

-#include "allheaders.h"
 #include "pageres.h"
-#include <tesseract/strngs.h>
 #include "tesseractclass.h"

+#include <allheaders.h>
+
 namespace tesseract {

-LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                                     int scale, int scaled_yres, int rect_left,
-                                     int rect_top, int rect_width,
+LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
+                                     int scaled_yres, int rect_left, int rect_top, int rect_width,
                                     int rect_height)
-    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top,
-                   rect_width, rect_height),
-      line_separator_("\n"),
-      paragraph_separator_("\n") {}
+    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,
+                   rect_height)
+    , line_separator_("\n")
+    , paragraph_separator_("\n") {}

 // Destructor.
 // It is defined here, so the compiler can create a single vtable
@ -42,23 +41,23 @@ LTRResultIterator::~LTRResultIterator() = default;

 // Returns the null terminated UTF-8 encoded text string for the current
 // object at the given level. Use delete [] to free after use.
-char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == nullptr)
-    return nullptr;  // Already at the end!
-  STRING text;
+    return nullptr; // Already at the end!
+  std::string text;
  PAGE_RES_IT res_it(*it_);
-  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  WERD_CHOICE *best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != nullptr);
  if (level == RIL_SYMBOL) {
    text = res_it.word()->BestUTF8(blob_index_, false);
  } else if (level == RIL_WORD) {
    text = best_choice->unichar_string();
  } else {
-    bool eol = false;  // end of line?
-    bool eop = false;  // end of paragraph?
-    do {               // for each paragraph in a block
-      do {             // for each text line in a paragraph
-        do {           // for each word in a text line
+    bool eol = false; // end of line?
+    bool eop = false; // end of paragraph?
+    do {              // for each paragraph in a block
+      do {            // for each text line in a paragraph
+        do {          // for each word in a text line
          best_choice = res_it.word()->best_choice;
          ASSERT_HOST(best_choice != nullptr);
          text += best_choice->unichar_string();
@ -66,7 +65,7 @@ char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
          res_it.forward();
          eol = res_it.row() != res_it.prev_row();
        } while (!eol);
-        text.truncate_at(text.length() - 1);
+        text.resize(text.length() - 1);
        text += line_separator_;
        eop = res_it.block() != res_it.prev_block() ||
              res_it.row()->row->para() != res_it.prev_row()->row->para();
@ -76,18 +75,18 @@ char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
    } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
  }
  int length = text.length() + 1;
-  char* result = new char[length];
+  char *result = new char[length];
  strncpy(result, text.c_str(), length);
  return result;
 }

 // Set the string inserted at the end of each text line. "\n" by default.
-void LTRResultIterator::SetLineSeparator(const char* new_line) {
+void LTRResultIterator::SetLineSeparator(const char *new_line) {
  line_separator_ = new_line;
 }

 // Set the string inserted at the end of each paragraph. "\n" by default.
-void LTRResultIterator::SetParagraphSeparator(const char* new_para) {
+void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
  paragraph_separator_ = new_para;
 }

@ -95,11 +94,11 @@ void LTRResultIterator::SetParagraphSeparator(const char* new_para) {
 // The number should be interpreted as a percent probability. (0.0f-100.0f)
 float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == nullptr)
-    return 0.0f;  // Already at the end!
+    return 0.0f; // Already at the end!
  float mean_certainty = 0.0f;
  int certainty_count = 0;
  PAGE_RES_IT res_it(*it_);
-  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  WERD_CHOICE *best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != nullptr);
  switch (level) {
    case RIL_BLOCK:
@ -145,10 +144,10 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  return 0.0f;
 }

-void LTRResultIterator::RowAttributes(float* row_height, float* descenders,
-                                      float* ascenders) const {
-  *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -
-                it_->row()->row->descenders();
+void LTRResultIterator::RowAttributes(float *row_height, float *descenders,
+                                      float *ascenders) const {
+  *row_height =
+      it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
  *descenders = it_->row()->row->descenders();
  *ascenders = it_->row()->row->ascenders();
 }
@ -161,37 +160,35 @@ void LTRResultIterator::RowAttributes(float* row_height, float* descenders,
 // the iterator itself, ie rendered invalid by various members of
 // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
 // Pointsize is returned in printers points (1/72 inch.)
-const char* LTRResultIterator::WordFontAttributes(
-    bool* is_bold, bool* is_italic, bool* is_underlined, bool* is_monospace,
-    bool* is_serif, bool* is_smallcaps, int* pointsize, int* font_id) const {
-  const char* result = nullptr;
+const char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,
+                                                  bool *is_underlined, bool *is_monospace,
+                                                  bool *is_serif, bool *is_smallcaps,
+                                                  int *pointsize, int *font_id) const {
+  const char *result = nullptr;

  if (it_->word() == nullptr) {
    // Already at the end!
    *pointsize = 0;
  } else {
-    float row_height = it_->row()->row->x_height() +
-                       it_->row()->row->ascenders() -
-                       it_->row()->row->descenders();
+    float row_height =
+        it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
    // Convert from pixels to printers points.
    *pointsize =
-        scaled_yres_ > 0
-            ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
-            : 0;
+        scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;

-    #ifndef DISABLED_LEGACY_ENGINE
-    const FontInfo* font_info = it_->word()->fontinfo;
+#ifndef DISABLED_LEGACY_ENGINE
+    const FontInfo *font_info = it_->word()->fontinfo;
    if (font_info) {
      // Font information available.
      *font_id = font_info->universal_id;
      *is_bold = font_info->is_bold();
      *is_italic = font_info->is_italic();
-      *is_underlined = false;  // TODO(rays) fix this!
+      *is_underlined = false; // TODO(rays) fix this!
      *is_monospace = font_info->is_fixed_pitch();
      *is_serif = font_info->is_serif();
      result = font_info->name;
    }
-    #endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE

    *is_smallcaps = it_->word()->small_caps;
  }
@ -210,7 +207,7 @@ const char* LTRResultIterator::WordFontAttributes(
 }

 // Returns the name of the language used to recognize this word.
-const char* LTRResultIterator::WordRecognitionLanguage() const {
+const char *LTRResultIterator::WordRecognitionLanguage() const {
  if (it_->word() == nullptr || it_->word()->tesseract == nullptr)
    return nullptr;
  return it_->word()->tesseract->lang.c_str();
@ -234,10 +231,9 @@ StrongScriptDirection LTRResultIterator::WordDirection() const {
 // Returns true if the current word was found in a dictionary.
 bool LTRResultIterator::WordIsFromDictionary() const {
  if (it_->word() == nullptr)
-    return false;  // Already at the end!
+    return false; // Already at the end!
  int permuter = it_->word()->best_choice->permuter();
-  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
-         permuter == USER_DAWG_PERM;
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM;
 }

 // Returns the number of blanks before the current word.
@ -250,7 +246,7 @@ int LTRResultIterator::BlanksBeforeWord() const {
 // Returns true if the current word is numeric.
 bool LTRResultIterator::WordIsNumeric() const {
  if (it_->word() == nullptr)
-    return false;  // Already at the end!
+    return false; // Already at the end!
  int permuter = it_->word()->best_choice->permuter();
  return permuter == NUMBER_PERM;
 }
@ -264,39 +260,38 @@ bool LTRResultIterator::HasBlamerInfo() const {
 #ifndef DISABLED_LEGACY_ENGINE
 // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
 // of the current word.
-const void* LTRResultIterator::GetParamsTrainingBundle() const {
+const void *LTRResultIterator::GetParamsTrainingBundle() const {
  return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)
             ? &(it_->word()->blamer_bundle->params_training_bundle())
             : nullptr;
 }
-#endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE

 // Returns the pointer to the string with blamer information for this word.
 // Assumes that the word's blamer_bundle is not nullptr.
-const char* LTRResultIterator::GetBlamerDebug() const {
+const char *LTRResultIterator::GetBlamerDebug() const {
  return it_->word()->blamer_bundle->debug().c_str();
 }

 // Returns the pointer to the string with misadaption information for this word.
 // Assumes that the word's blamer_bundle is not nullptr.
-const char* LTRResultIterator::GetBlamerMisadaptionDebug() const {
+const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
  return it_->word()->blamer_bundle->misadaption_debug().c_str();
 }

 // Returns true if a truth string was recorded for the current word.
 bool LTRResultIterator::HasTruthString() const {
  if (it_->word() == nullptr)
-    return false;  // Already at the end!
-  if (it_->word()->blamer_bundle == nullptr ||
-      it_->word()->blamer_bundle->NoTruth()) {
-    return false;  // no truth information for this word
+    return false; // Already at the end!
+  if (it_->word()->blamer_bundle == nullptr || it_->word()->blamer_bundle->NoTruth()) {
+    return false; // no truth information for this word
  }
  return true;
 }

 // Returns true if the given string is equivalent to the truth string for
 // the current word.
-bool LTRResultIterator::EquivalentToTruth(const char* str) const {
+bool LTRResultIterator::EquivalentToTruth(const char *str) const {
  if (!HasTruthString())
    return false;
  ASSERT_HOST(it_->word()->uch_set != nullptr);
@ -306,39 +301,39 @@ bool LTRResultIterator::EquivalentToTruth(const char* str) const {

 // Returns the null terminated UTF-8 encoded truth string for the current word.
 // Use delete [] to free after use.
-char* LTRResultIterator::WordTruthUTF8Text() const {
+char *LTRResultIterator::WordTruthUTF8Text() const {
  if (!HasTruthString())
    return nullptr;
-  STRING truth_text = it_->word()->blamer_bundle->TruthString();
+  std::string truth_text = it_->word()->blamer_bundle->TruthString();
  int length = truth_text.length() + 1;
-  char* result = new char[length];
+  char *result = new char[length];
  strncpy(result, truth_text.c_str(), length);
  return result;
 }

 // Returns the null terminated UTF-8 encoded normalized OCR string for the
 // current word. Use delete [] to free after use.
-char* LTRResultIterator::WordNormedUTF8Text() const {
+char *LTRResultIterator::WordNormedUTF8Text() const {
  if (it_->word() == nullptr)
-    return nullptr;  // Already at the end!
-  STRING ocr_text;
-  WERD_CHOICE* best_choice = it_->word()->best_choice;
-  const UNICHARSET* unicharset = it_->word()->uch_set;
+    return nullptr; // Already at the end!
+  std::string ocr_text;
+  WERD_CHOICE *best_choice = it_->word()->best_choice;
+  const UNICHARSET *unicharset = it_->word()->uch_set;
  ASSERT_HOST(best_choice != nullptr);
  for (int i = 0; i < best_choice->length(); ++i) {
    ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
  }
  int length = ocr_text.length() + 1;
-  char* result = new char[length];
+  char *result = new char[length];
  strncpy(result, ocr_text.c_str(), length);
  return result;
 }

 // Returns a pointer to serialized choice lattice.
 // Fills lattice_size with the number of bytes in lattice data.
-const char* LTRResultIterator::WordLattice(int* lattice_size) const {
+const char *LTRResultIterator::WordLattice(int *lattice_size) const {
  if (it_->word() == nullptr)
-    return nullptr;  // Already at the end!
+    return nullptr; // Already at the end!
  if (it_->word()->blamer_bundle == nullptr)
    return nullptr;
  *lattice_size = it_->word()->blamer_bundle->lattice_size();
@ -350,8 +345,7 @@ const char* LTRResultIterator::WordLattice(int* lattice_size) const {
 // this will return the attributes of the first symbol in that word.
 bool LTRResultIterator::SymbolIsSuperscript() const {
  if (cblob_it_ == nullptr && it_->word() != nullptr)
-    return it_->word()->best_choice->BlobPosition(blob_index_) ==
-           SP_SUPERSCRIPT;
+    return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
  return false;
 }

@ -373,7 +367,7 @@ bool LTRResultIterator::SymbolIsDropcap() const {
  return false;
 }

-ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
+ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
  ASSERT_HOST(result_it.it_->word() != nullptr);
  word_res_ = result_it.it_->word();
  oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
@ -383,7 +377,7 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
  bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;
  rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;
  blanks_before_word_ = result_it.BlanksBeforeWord();
-  BLOB_CHOICE_LIST* choices = nullptr; 
+  BLOB_CHOICE_LIST *choices = nullptr;
  tstep_index_ = &result_it.blob_index_;
  if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
    if (!word_res_->CTC_symbol_choices[0].empty() &&
@ -417,8 +411,7 @@ ChoiceIterator::~ChoiceIterator() {
 // are none left.
 bool ChoiceIterator::Next() {
  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
-    if (LSTM_choice_it_ != LSTM_choices_->end() &&
-        next(LSTM_choice_it_) == LSTM_choices_->end()) {
+    if (LSTM_choice_it_ != LSTM_choices_->end() && next(LSTM_choice_it_) == LSTM_choices_->end()) {
      return false;
    } else {
      ++LSTM_choice_it_;
@ -434,9 +427,9 @@ bool ChoiceIterator::Next() {

 // Returns the null terminated UTF-8 encoded text string for the current
 // choice. Do NOT use delete [] to free after use.
-const char* ChoiceIterator::GetUTF8Text() const {
+const char *ChoiceIterator::GetUTF8Text() const {
  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
-    std::pair<const char*, float> choice = *LSTM_choice_it_;
+    std::pair<const char *, float> choice = *LSTM_choice_it_;
    return choice.first;
  } else {
    if (choice_it_ == nullptr)
@ -455,7 +448,7 @@ const char* ChoiceIterator::GetUTF8Text() const {
 float ChoiceIterator::Confidence() const {
  float confidence;
  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
-    std::pair<const char*, float> choice = *LSTM_choice_it_;
+    std::pair<const char *, float> choice = *LSTM_choice_it_;
    confidence = 100 - rating_coefficient_ * choice.second;
  } else {
    if (choice_it_ == nullptr)
@ -466,8 +459,7 @@ float ChoiceIterator::Confidence() const {
 }

 // Returns the set of timesteps which belong to the current symbol
-std::vector<std::vector<std::pair<const char*, float>>>*
-ChoiceIterator::Timesteps() const {
+std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
  int offset = *tstep_index_ + blanks_before_word_;
  if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
    return nullptr;
@ -478,7 +470,7 @@ ChoiceIterator::Timesteps() const {
 void ChoiceIterator::filterSpaces() {
  if (LSTM_choices_->empty())
    return;
-  std::vector<std::pair<const char*, float>>::iterator it;
+  std::vector<std::pair<const char *, float>>::iterator it;
  for (it = LSTM_choices_->begin(); it != LSTM_choices_->end();) {
    if (!strcmp(it->first, " ")) {
      it = LSTM_choices_->erase(it);
@ -487,4 +479,4 @@ void ChoiceIterator::filterSpaces() {
    }
  }
 }
-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/mutableiterator.cpp
+++ b/src/ccmain/mutableiterator.cpp
@ -21,4 +21,4 @@ namespace tesseract {
 // instead of weak vtables in every compilation unit.
 MutableIterator::~MutableIterator() = default;

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/mutableiterator.h
+++ b/src/ccmain/mutableiterator.h
@ -3,7 +3,6 @@
 // Description: Iterator for tesseract results providing access to
 //              both high-level API and Tesseract internal data structures.
 // Author:      David Eger
-// Created:     Thu Feb 24 19:01:06 PST 2011
 //
 // (C) Copyright 2011, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -36,29 +35,28 @@ class Tesseract;
 // therefore can only be used while the TessBaseAPI class still exists and
 // has not been subjected to a call of Init, SetImage, Recognize, Clear, End
 // DetectOS, or anything else that changes the internal PAGE_RES.
-// See tesseract/apitypes.h for the definition of PageIteratorLevel.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
 // See also base class PageIterator, which contains the bulk of the interface.
 // ResultIterator adds text-specific methods for access to OCR output.
 // MutableIterator adds access to internal data structures.

-class MutableIterator : public ResultIterator {
- public:
+class TESS_API MutableIterator : public ResultIterator {
+public:
  // See argument descriptions in ResultIterator()
-  MutableIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                  int scale, int scaled_yres,
-                  int rect_left, int rect_top,
-                  int rect_width, int rect_height)
-      : ResultIterator(
-          LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
-                            rect_top, rect_width, rect_height)) {}
+  MutableIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
+                  int rect_left, int rect_top, int rect_width, int rect_height)
+      : ResultIterator(LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
+                                         rect_top, rect_width, rect_height)) {}
  ~MutableIterator() override;

  // See PageIterator and ResultIterator for most calls.

  // Return access to Tesseract internals.
-  const PAGE_RES_IT *PageResIt() const { return it_; }
+  const PAGE_RES_IT *PageResIt() const {
+    return it_;
+  }
 };

-}  // namespace tesseract.
+} // namespace tesseract.

-#endif  // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
--- a/src/ccmain/osdetect.cpp
+++ b/src/ccmain/osdetect.cpp
@ -17,10 +17,6 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#include <algorithm>
-#include <cmath>        // for std::fabs
-#include <memory>
-
 #include <tesseract/osdetect.h>

 #include "blobbox.h"
@ -32,11 +28,16 @@
 #include "oldlist.h"
 #include "qrsequence.h"
 #include "ratngs.h"
-#include <tesseract/strngs.h>
 #include "tabvector.h"
 #include "tesseractclass.h"
 #include "textord.h"

+#include <algorithm>
+#include <cmath> // for std::fabs
+#include <memory>
+
+namespace tesseract {
+
 const float kSizeRatioToReject = 2.0;
 const int kMinAcceptableBlobHeight = 10;

@ -48,16 +49,16 @@ const float kHanRatioInJapanese = 0.3;
 const float kNonAmbiguousMargin = 1.0;

 // General scripts
-static const char* han_script = "Han";
-static const char* latin_script = "Latin";
-static const char* katakana_script = "Katakana";
-static const char* hiragana_script = "Hiragana";
-static const char* hangul_script = "Hangul";
+static const char *han_script = "Han";
+static const char *latin_script = "Latin";
+static const char *katakana_script = "Katakana";
+static const char *hiragana_script = "Hiragana";
+static const char *hangul_script = "Hangul";

 // Pseudo-scripts Name
-const char* ScriptDetector::korean_script_ = "Korean";
-const char* ScriptDetector::japanese_script_ = "Japanese";
-const char* ScriptDetector::fraktur_script_ = "Fraktur";
+const char *ScriptDetector::korean_script_ = "Korean";
+const char *ScriptDetector::japanese_script_ = "Japanese";
+const char *ScriptDetector::fraktur_script_ = "Fraktur";

 void OSResults::update_best_orientation() {
  float first = orientations[0];
@ -105,8 +106,8 @@ void OSResults::update_best_script(int orientation) {
      second = scripts_na[orientation][i];
    }
  }
-  best_result.sconfidence = (second == 0.0f) ? 2.0f :
-      (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
+  best_result.sconfidence =
+      (second == 0.0f) ? 2.0f : (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
 }

 int OSResults::get_best_script(int orientation_id) const {
@ -114,8 +115,7 @@ int OSResults::get_best_script(int orientation_id) const {
  for (int j = 0; j < kMaxNumberOfScripts; ++j) {
    const char *script = unicharset->get_script_from_script_id(j);
    if (strcmp(script, "Common") && strcmp(script, "NULL")) {
-      if (max_id == -1 ||
-          scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
+      if (max_id == -1 || scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
        max_id = j;
    }
  }
@ -135,13 +135,13 @@ void OSResults::print_scores(int orientation_id) const {
  for (int j = 0; j < kMaxNumberOfScripts; ++j) {
    if (scripts_na[orientation_id][j]) {
      tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
-             scripts_na[orientation_id][j]);
+              scripts_na[orientation_id][j]);
    }
  }
 }

 // Accumulate scores with given OSResults instance and update the best script.
-void OSResults::accumulate(const OSResults& osr) {
+void OSResults::accumulate(const OSResults &osr) {
  for (int i = 0; i < 4; ++i) {
    orientations[i] += osr.orientations[i];
    for (int j = 0; j < kMaxNumberOfScripts; ++j)
@ -154,8 +154,7 @@ void OSResults::accumulate(const OSResults& osr) {

 // Detect and erase horizontal/vertical lines and picture regions from the
 // image, so that non-text blobs are removed from consideration.
-static void remove_nontext_regions(tesseract::Tesseract *tess,
-                                   BLOCK_LIST *blocks,
+static void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
                                   TO_BLOCK_LIST *to_blocks) {
  Pix *pix = tess->pix_binary();
  ASSERT_HOST(pix != nullptr);
@ -166,37 +165,31 @@ static void remove_nontext_regions(tesseract::Tesseract *tess,
  int resolution;
  if (kMinCredibleResolution > pixGetXRes(pix)) {
    resolution = kMinCredibleResolution;
-    tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n",
-            pixGetXRes(pix), resolution);
+    tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", pixGetXRes(pix), resolution);
  } else {
    resolution = pixGetXRes(pix);
  }

-  tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
-                                            &vertical_x, &vertical_y,
+  tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y,
                                            nullptr, &v_lines, &h_lines);
-  Pix* im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
+  Pix *im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
  if (im_pix != nullptr) {
    pixSubtract(pix, pix, im_pix);
    pixDestroy(&im_pix);
  }
-  tess->mutable_textord()->find_components(tess->pix_binary(),
-                                           blocks, to_blocks);
+  tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks);
 }

 // Find connected components in the page and process a subset until finished or
 // a stopping criterion is met.
 // Returns the number of blobs used in making the estimate. 0 implies failure.
-int orientation_and_script_detection(STRING& filename,
-                                     OSResults* osr,
-                                     tesseract::Tesseract* tess) {
-  STRING name = filename;        //truncated name
-  const char *lastdot;           //of name
-  TBOX page_box;
+int orientation_and_script_detection(const char *filename, OSResults *osr,
+                                     tesseract::Tesseract *tess) {
+  std::string name = filename; // truncated name

-  lastdot = strrchr(name.c_str(), '.');
+  const char *lastdot = strrchr(name.c_str(), '.');
  if (lastdot != nullptr)
-    name[lastdot-name.c_str()] = '\0';
+    name[lastdot - name.c_str()] = '\0';

  ASSERT_HOST(tess->pix_binary() != nullptr);
  int width = pixGetWidth(tess->pix_binary());
@ -212,16 +205,11 @@ int orientation_and_script_detection(STRING& filename,

  if (port_blocks.empty()) {
    // page segmentation did not succeed, so we need to find_components first.
-    tess->mutable_textord()->find_components(tess->pix_binary(),
-                                             &blocks, &port_blocks);
+    tess->mutable_textord()->find_components(tess->pix_binary(), &blocks, &port_blocks);
  } else {
-    page_box.set_left(0);
-    page_box.set_bottom(0);
-    page_box.set_right(width);
-    page_box.set_top(height);
+    TBOX page_box(0, 0, width, height);
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
-    tess->mutable_textord()->filter_blobs(page_box.topright(),
-                                          &port_blocks, true);
+    tess->mutable_textord()->filter_blobs(page_box.topright(), &port_blocks, true);
  }

  return os_detect(&port_blocks, osr, tess);
@ -230,8 +218,7 @@ int orientation_and_script_detection(STRING& filename,
 // Filter and sample the blobs.
 // Returns a non-zero number of blobs if the page was successfully processed, or
 // zero if the page had too few characters to be reliable
-int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
-              tesseract::Tesseract* tess) {
+int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess) {
  int blobs_total = 0;
  TO_BLOCK_IT block_it;
  block_it.set_to_list(port_blocks);
@ -239,30 +226,31 @@ int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
  BLOBNBOX_CLIST filtered_list;
  BLOBNBOX_C_IT filtered_it(&filtered_list);

-  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
-       block_it.forward ()) {
-    TO_BLOCK* to_block = block_it.data();
-    if (to_block->block->pdblk.poly_block() &&
-        !to_block->block->pdblk.poly_block()->IsText()) continue;
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    TO_BLOCK *to_block = block_it.data();
+    if (to_block->block->pdblk.poly_block() && !to_block->block->pdblk.poly_block()->IsText())
+      continue;
    BLOBNBOX_IT bbox_it;
    bbox_it.set_to_list(&to_block->blobs);
-    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
-         bbox_it.forward ()) {
-      BLOBNBOX* bbox = bbox_it.data();
-      C_BLOB*   blob = bbox->cblob();
-      TBOX      box = blob->bounding_box();
+    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
+      BLOBNBOX *bbox = bbox_it.data();
+      C_BLOB *blob = bbox->cblob();
+      TBOX box = blob->bounding_box();
      ++blobs_total;

      // Catch illegal value of box width and avoid division by zero.
-      if (box.width() == 0) continue;
+      if (box.width() == 0)
+        continue;
      // TODO: Can height and width be negative? If not, remove fabs.
      float y_x = std::fabs((box.height() * 1.0f) / box.width());
      float x_y = 1.0f / y_x;
      // Select a >= 1.0 ratio
      float ratio = x_y > y_x ? x_y : y_x;
      // Blob is ambiguous
-      if (ratio > kSizeRatioToReject) continue;
-      if (box.height() < kMinAcceptableBlobHeight) continue;
+      if (ratio > kSizeRatioToReject)
+        continue;
+      if (box.height() < kMinAcceptableBlobHeight)
+        continue;
      filtered_it.add_to_end(bbox);
    }
  }
@ -275,9 +263,8 @@ int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
 // If allowed_scripts is non-null and non-empty, it is a list of scripts that
 // constrains both orientation and script detection to consider only scripts
 // from the list.
-int os_detect_blobs(const GenericVector<int>* allowed_scripts,
-                    BLOBNBOX_CLIST* blob_list, OSResults* osr,
-                    tesseract::Tesseract* tess) {
+int os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,
+                    OSResults *osr, tesseract::Tesseract *tess) {
  OSResults osr_;
  int minCharactersToTry = tess->min_characters_to_try;
  int maxCharactersToTry = 5 * minCharactersToTry;
@ -300,22 +287,20 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
    return 0;
  }

-  auto** blobs = new BLOBNBOX*[filtered_it.length()];
+  auto **blobs = new BLOBNBOX *[filtered_it.length()];
  int number_of_blobs = 0;
-  for (filtered_it.mark_cycle_pt (); !filtered_it.cycled_list ();
-       filtered_it.forward ()) {
+  for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list(); filtered_it.forward()) {
    blobs[number_of_blobs++] = filtered_it.data();
  }
  QRSequenceGenerator sequence(number_of_blobs);
  int num_blobs_evaluated = 0;
  for (int i = 0; i < real_max; ++i) {
-    if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
-        && i > minCharactersToTry) {
+    if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess) && i > minCharactersToTry) {
      break;
    }
    ++num_blobs_evaluated;
  }
-  delete [] blobs;
+  delete[] blobs;

  // Make sure the best_result is up-to-date
  int orientation = o.get_orientation();
@ -326,13 +311,12 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
 // Processes a single blob to estimate script and orientation.
 // Return true if estimate of orientation and script satisfies stopping
 // criteria.
-bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
-                    ScriptDetector* s, OSResults* osr,
-                    tesseract::Tesseract* tess) {
+bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *osr,
+                    tesseract::Tesseract *tess) {
  tess->tess_cn_matching.set_value(true); // turn it on
  tess->tess_bn_matching.set_value(false);
-  C_BLOB* blob = bbox->cblob();
-  TBLOB* tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
+  C_BLOB *blob = bbox->cblob();
+  TBLOB *tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
  TBOX box = tblob->bounding_box();
  FCOORD current_rotation(1.0f, 0.0f);
  FCOORD rotation90(0.0f, 1.0f);
@ -354,10 +338,8 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
      x_origin = i == 1 ? box.left() : box.right();
    }
    std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
-    rotated_blob->Normalize(nullptr, &current_rotation, nullptr,
-                            x_origin, y_origin, scaling, scaling,
-                            0.0f, static_cast<float>(kBlnBaselineOffset),
-                            false, nullptr);
+    rotated_blob->Normalize(nullptr, &current_rotation, nullptr, x_origin, y_origin, scaling,
+                            scaling, 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
    tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
    current_rotation.rotate(rotation90);
  }
@ -370,27 +352,25 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
  return stop;
 }

-
-OrientationDetector::OrientationDetector(
-    const GenericVector<int>* allowed_scripts, OSResults* osr) {
+OrientationDetector::OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *osr) {
  osr_ = osr;
  allowed_scripts_ = allowed_scripts;
 }

 // Score the given blob and return true if it is now sure of the orientation
 // after adding this block.
-bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
+bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
  float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  float total_blob_o_score = 0.0f;

  for (int i = 0; i < 4; ++i) {
    BLOB_CHOICE_IT choice_it(scores + i);
    if (!choice_it.empty()) {
-      BLOB_CHOICE* choice = nullptr;
+      BLOB_CHOICE *choice = nullptr;
      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
        // Find the top choice in an allowed script.
-        for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
-             choice == nullptr; choice_it.forward()) {
+        for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
+             choice_it.forward()) {
          int choice_script = choice_it.data()->script_id();
          int s = 0;
          for (s = 0; s < allowed_scripts_->size(); ++s) {
@ -411,7 +391,8 @@ bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
      }
    }
  }
-  if (total_blob_o_score == 0.0) return false;
+  if (total_blob_o_score == 0.0)
+    return false;
  // Fill in any blanks with the worst score of the others. This is better than
  // picking an arbitrary probability for it and way better than -inf.
  float worst_score = 0.0f;
@ -427,7 +408,7 @@ bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
    // Lower worst if there is only one.
    worst_score /= 2.0f;
  }
-  for (float& f : blob_o_score) {
+  for (float &f : blob_o_score) {
    if (f == 0.0f) {
      f = worst_score;
      total_blob_o_score += worst_score;
@ -449,9 +430,8 @@ int OrientationDetector::get_orientation() {
  return osr_->best_result.orientation_id;
 }

-
-ScriptDetector::ScriptDetector(const GenericVector<int>* allowed_scripts,
-                               OSResults* osr, tesseract::Tesseract* tess) {
+ScriptDetector::ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
+                               tesseract::Tesseract *tess) {
  osr_ = osr;
  tess_ = tess;
  allowed_scripts_ = allowed_scripts;
@ -465,12 +445,11 @@ ScriptDetector::ScriptDetector(const GenericVector<int>* allowed_scripts,
  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
 }

-
 // Score the given blob and accumulate its script evidence into osr_. Unlike
 // OrientationDetector::detect_blob, this returns nothing (it is void).
-void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
+void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
  for (int i = 0; i < 4; ++i) {
-    bool done[kMaxNumberOfScripts] = { false };
+    bool done[kMaxNumberOfScripts] = {false};

    BLOB_CHOICE_IT choice_it;
    choice_it.set_to_list(scores + i);
@ -479,23 +458,25 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
    int script_count = 0;
    int prev_id = -1;
    int prev_fontinfo_id = -1;
-    const char* prev_unichar = "";
-    const char* unichar = "";
+    const char *prev_unichar = "";
+    const char *unichar = "";

-    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
-         choice_it.forward()) {
-      BLOB_CHOICE* choice = choice_it.data();
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      BLOB_CHOICE *choice = choice_it.data();
      int id = choice->script_id();
      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
        // Check that the choice is in an allowed script.
        int s = 0;
        for (s = 0; s < allowed_scripts_->size(); ++s) {
-          if ((*allowed_scripts_)[s] == id) break;
+          if ((*allowed_scripts_)[s] == id)
+            break;
        }
-        if (s == allowed_scripts_->size()) continue;  // Not found in list.
+        if (s == allowed_scripts_->size())
+          continue; // Not found in list.
      }
      // Script already processed before.
-      if (done[id]) continue;
+      if (done[id])
+        continue;
      done[id] = true;

      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
@ -527,9 +508,8 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
      // Workaround for Fraktur
      if (prev_id == latin_id_) {
        if (prev_fontinfo_id >= 0) {
-          const tesseract::FontInfo &fi =
-              tess_->get_fontinfo_table().get(prev_fontinfo_id);
-          //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
+          const tesseract::FontInfo &fi = tess_->get_fontinfo_table().get(prev_fontinfo_id);
+          // printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
          //       fi.is_serif(), fi.is_fraktur(),
          //       prev_unichar);
@ -552,7 +532,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
      }
    }
-  }  // iterate over each orientation
+  } // iterate over each orientation
 }

 bool ScriptDetector::must_stop(int orientation) {
@ -563,7 +543,7 @@ bool ScriptDetector::must_stop(int orientation) {
 // Helper method to convert an orientation index to its value in degrees.
 // The value represents the amount of clockwise rotation in degrees that must be
 // applied for the text to be upright (readable).
-int OrientationIdToValue(const int& id) {
+int OrientationIdToValue(const int &id) {
  switch (id) {
    case 0:
      return 0;
@ -577,3 +557,5 @@ int OrientationIdToValue(const int& id) {
      return -1;
  }
 }
+
+} // namespace tesseract
--- a/src/ccmain/output.cpp
+++ b/src/ccmain/output.cpp
@ -16,73 +16,72 @@
 *
 **********************************************************************/

-#include <cctype>
-#include <cerrno>
-#include <cstring>
-#include "control.h"
-#include <tesseract/helpers.h>
 #include "output.h"
+
+#include "control.h"
 #include "tesseractclass.h"
 #include "tessvars.h"
 #ifndef DISABLED_LEGACY_ENGINE
-#include "docqual.h"
-#include "reject.h"
+#  include "docqual.h"
+#  include "reject.h"
 #endif

-#define CTRL_NEWLINE    '\012'   //newline
-#define CTRL_HARDLINE   '\015'   //cr
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+#define CTRL_NEWLINE '\012'  // newline
+#define CTRL_HARDLINE '\015' // cr

 namespace tesseract {
-void Tesseract::output_pass(  //Tess output pass //send to api
-                            PAGE_RES_IT &page_res_it,
-                            const TBOX *target_word_box) {
+void Tesseract::output_pass( // Tess output pass //send to api
+    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word;
-  bool force_eol;               //During output
-  BLOCK *nextblock;              //block of next word
-  WERD *nextword;                //next word
+  bool force_eol;   // During output
+  BLOCK *nextblock; // block of next word
+  WERD *nextword;   // next word

-  page_res_it.restart_page ();
+  page_res_it.restart_page();
  block_of_last_word = nullptr;
-  while (page_res_it.word () != nullptr) {
-    check_debug_pt (page_res_it.word (), 120);
+  while (page_res_it.word() != nullptr) {
+    check_debug_pt(page_res_it.word(), 120);

    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
-      FCOORD center_pt(
-          (current_word_box.right() + current_word_box.left()) / 2,
-          (current_word_box.bottom() + current_word_box.top()) / 2);
+      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
+                       (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        page_res_it.forward();
        continue;
      }
    }
-    if (tessedit_write_block_separators &&
-    block_of_last_word != page_res_it.block ()) {
-      block_of_last_word = page_res_it.block ();
+    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
+      block_of_last_word = page_res_it.block();
    }

-    force_eol = (tessedit_write_block_separators &&
-      (page_res_it.block () != page_res_it.next_block ())) ||
-      (page_res_it.next_word () == nullptr);
+    force_eol =
+        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
+        (page_res_it.next_word() == nullptr);

-    if (page_res_it.next_word () != nullptr)
-      nextword = page_res_it.next_word ()->word;
+    if (page_res_it.next_word() != nullptr)
+      nextword = page_res_it.next_word()->word;
    else
      nextword = nullptr;
-    if (page_res_it.next_block () != nullptr)
-      nextblock = page_res_it.next_block ()->block;
+    if (page_res_it.next_block() != nullptr)
+      nextblock = page_res_it.next_block()->block;
    else
      nextblock = nullptr;
-                                 //regardless of tilde crunching
+    // regardless of tilde crunching
    write_results(page_res_it,
-                  determine_newline_type(page_res_it.word()->word,
-                                         page_res_it.block()->block,
-                                         nextword, nextblock), force_eol);
+                  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
+                                         nextword, nextblock),
+                  force_eol);
    page_res_it.forward();
  }
 }

-
 /*************************************************************************
 * write_results()
 *
@ -95,8 +94,8 @@ void Tesseract::output_pass(  //Tess output pass //send to api
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *************************************************************************/
-void Tesseract::write_results(PAGE_RES_IT& page_res_it,
-                              char newline_type,  // type of newline
+void Tesseract::write_results(PAGE_RES_IT &page_res_it,
+                              char newline_type, // type of newline
                              bool force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
@ -104,19 +103,14 @@ void Tesseract::write_results(PAGE_RES_IT& page_res_it,
  bool need_reject = false;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

-  if ((word->unlv_crunch_mode != CR_NONE ||
-       word->best_choice->length() == 0) &&
+  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
-         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
-          (word->word->space () > 0) &&
-          !word->word->flag (W_FUZZY_NON) &&
-          !word->word->flag (W_FUZZY_SP)))) {
-      if (!word->word->flag (W_BOL) &&
-          (word->word->space () > 0) &&
-          !word->word->flag (W_FUZZY_NON) &&
-          !word->word->flag (W_FUZZY_SP)) {
+         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
+          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
+      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
+          !word->word->flag(W_FUZZY_SP)) {
        stats_.last_char_was_tilde = false;
      }
      need_reject = true;
@ -130,7 +124,7 @@ void Tesseract::write_results(PAGE_RES_IT& page_res_it,
      stats_.write_results_empty_block = false;
    }

-    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
+    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
@ -148,40 +142,35 @@ void Tesseract::write_results(PAGE_RES_IT& page_res_it,
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
-  stats_.write_results_empty_block = force_eol;  // about to write a real word
+  stats_.write_results_empty_block = force_eol; // about to write a real word

-  if (unlv_tilde_crunching &&
-      stats_.last_char_was_tilde &&
-      (word->word->space() == 0) &&
+  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
-       words have been removed */
+   words have been removed */
    word->MergeAdjacentBlobs(0);
  }
-  if (newline_type ||
-    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
+  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
-    if (word->reject_map.length () > 0) {
+    if (word->reject_map.length() > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
-    }
-    else if (word->word->space () > 0)
+    } else if (word->word->space() > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

-  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
+  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
-  check_debug_pt (word, 120);
+  check_debug_pt(word, 120);
  if (tessedit_rejection_debug) {
-    tprintf ("Dict word: \"%s\": %d\n",
-             word->best_choice->debug_string().c_str(),
-             dict_word(*(word->best_choice)));
+    tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
+            dict_word(*(word->best_choice)));
  }
  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
    if (tessedit_zero_rejection) {
@ -194,14 +183,12 @@ void Tesseract::write_results(PAGE_RES_IT& page_res_it,
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
-        if ((word->best_choice->unichar_id(i) != space) &&
-            word->reject_map[i].rejected())
+        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
 }
-}  // namespace tesseract

 /**********************************************************************
 * determine_newline_type
@ -210,31 +197,31 @@ void Tesseract::write_results(PAGE_RES_IT& page_res_it,
 * Return false if not at end of line.
 **********************************************************************/

-char determine_newline_type(                   //test line ends
-                            WERD *word,        //word to do
-                            BLOCK *block,      //current block
-                            WERD *next_word,   //next word
-                            BLOCK *next_block  //block of next word
-                           ) {
-  int16_t end_gap;                 //to right edge
-  int16_t width;                   //of next word
-  TBOX word_box;                  //bounding
-  TBOX next_box;                  //next word
-  TBOX block_box;                 //block bounding
+char determine_newline_type( // test line ends
+    WERD *word,              // word to do
+    BLOCK *block,            // current block
+    WERD *next_word,         // next word
+    BLOCK *next_block        // block of next word
+) {
+  int16_t end_gap; // to right edge
+  int16_t width;   // of next word
+  TBOX word_box;   // bounding
+  TBOX next_box;   // next word
+  TBOX block_box;  // block bounding

-  if (!word->flag (W_EOL))
-    return false;                //not end of line
+  if (!word->flag(W_EOL))
+    return false; // not end of line
  if (next_word == nullptr || next_block == nullptr || block != next_block)
    return CTRL_NEWLINE;
-  if (next_word->space () > 0)
-    return CTRL_HARDLINE;        //it is tabbed
-  word_box = word->bounding_box ();
-  next_box = next_word->bounding_box ();
-  block_box = block->pdblk.bounding_box ();
-                                 //gap to eol
-  end_gap = block_box.right () - word_box.right ();
-  end_gap -= static_cast<int32_t>(block->space ());
-  width = next_box.right () - next_box.left ();
+  if (next_word->space() > 0)
+    return CTRL_HARDLINE; // it is tabbed
+  word_box = word->bounding_box();
+  next_box = next_word->bounding_box();
+  block_box = block->pdblk.bounding_box();
+  // gap to eol
+  end_gap = block_box.right() - word_box.right();
+  end_gap -= static_cast<int32_t>(block->space());
+  width = next_box.right() - next_box.left();
  //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
  //              block_box.right(),word_box.right(),end_gap,
  //              next_box.right(),next_box.left(),width,
@ -247,11 +234,10 @@ char determine_newline_type(                   //test line ends
 * Return the first accepted character from the repetition string. This is the
 * character which is repeated - as determined earlier by fix_rep_char()
 *************************************************************************/
-namespace tesseract {
-UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
+UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
  int i;
-  for (i = 0; ((i < word->reject_map.length()) &&
-               (word->reject_map[i].rejected())); ++i);
+  for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i)
+    ;

  if (i < word->reject_map.length()) {
    return word->best_choice->unichar_id(i);
@ -286,16 +272,14 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  }

  if (suspect_level >= 3)
-    return;                      //Use defaults
+    return; // Use defaults

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

-  if (safe_dict_word(word_res) &&
-      (count_alphas(word) > suspect_short_words)) {
+  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
    /* Unreject alphas in dictionary words */
    for (i = 0; i < len; ++i) {
-      if (word_res->reject_map[i].rejected() &&
-          uchset.get_isalpha(word.unichar_id(i)))
+      if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i)))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }
@ -303,13 +287,12 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  rating_per_ch = word.rating() / word_res->reject_map.length();

  if (rating_per_ch >= suspect_rating_per_ch)
-    return;  // Don't touch bad ratings
+    return; // Don't touch bad ratings

  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
    for (i = 0; i < len; ++i) {
-      if (word_res->reject_map[i].rejected() &&
-          (!uchset.eq(word.unichar_id(i), " ")))
+      if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " ")))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }
@ -328,34 +311,28 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  if (suspect_level == 2)
    return;

-  if (!suspect_constrain_1Il ||
-      (word_res->reject_map.length() <= suspect_short_words)) {
+  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected()) {
        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
-          word_res->reject_map[i].flag(R_POSTNN_1IL)))
+             word_res->reject_map[i].flag(R_POSTNN_1IL)))
          word_res->reject_map[i].setrej_minimal_rej_accept();

-        if (!suspect_constrain_1Il &&
-          word_res->reject_map[i].flag(R_MM_REJECT))
+        if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT))
          word_res->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }

-  if (acceptable_word_string(*word_res->uch_set,
-                             word.unichar_string().c_str(),
-                             word.unichar_lengths().c_str()) !=
-                                 AC_UNACCEPTABLE ||
-      acceptable_number_string(word.unichar_string().c_str(),
-                               word.unichar_lengths().c_str())) {
+  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
+                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
+      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
    if (word_res->reject_map.length() > suspect_short_words) {
      for (i = 0; i < len; i++) {
-        if (word_res->reject_map[i].rejected() &&
-          (!word_res->reject_map[i].perm_rejected() ||
-           word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
-           word_res->reject_map[i].flag (R_POSTNN_1IL) ||
-           word_res->reject_map[i].flag (R_MM_REJECT))) {
+        if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
+                                                   word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
+                                                   word_res->reject_map[i].flag(R_POSTNN_1IL) ||
+                                                   word_res->reject_map[i].flag(R_MM_REJECT))) {
          word_res->reject_map[i].setrej_minimal_rej_accept();
        }
      }
@ -372,7 +349,6 @@ int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
  return count;
 }

-
 int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
  int count = 0;
  for (int i = 0; i < word.length(); ++i) {
@ -383,29 +359,24 @@ int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
  return count;
 }

-
-bool Tesseract::acceptable_number_string(const char* s,
-                                         const char* lengths) {
+bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
  bool prev_digit = false;

  if (*lengths == 1 && *s == '(')
    s++;

-  if (*lengths == 1 &&
-      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
+  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
    s++;

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths))
      prev_digit = true;
-    else if (prev_digit &&
-             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
+    else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
      prev_digit = false;
-    else if (prev_digit && *lengths == 1 &&
-             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
+    else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
+             ((*s == '%') || (*s == ')')))
      return true;
-    else if (prev_digit &&
-             *lengths == 1 && (*s == '%') &&
+    else if (prev_digit && *lengths == 1 && (*s == '%') &&
             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
             (*(s + *lengths + *(lengths + 1)) == '\0'))
      return true;
@ -414,4 +385,4 @@ bool Tesseract::acceptable_number_string(const char* s,
  }
  return true;
 }
-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/output.h
+++ b/src/ccmain/output.h
@ -20,14 +20,18 @@
 #ifndef OUTPUT_H
 #define OUTPUT_H

+namespace tesseract {
+
 class BLOCK;
 class WERD;

 /** test line ends */
-char determine_newline_type(WERD *word,        ///< word to do
-                            BLOCK *block,      ///< current block
-                            WERD *next_word,   ///< next word
-                            BLOCK *next_block  ///< block of next word
-                           );
+char determine_newline_type(WERD *word,       ///< word to do
+                            BLOCK *block,     ///< current block
+                            WERD *next_word,  ///< next word
+                            BLOCK *next_block ///< block of next word
+);
+
+} // namespace tesseract

 #endif
--- a/src/ccmain/pageiterator.cpp
+++ b/src/ccmain/pageiterator.cpp
@ -17,9 +17,9 @@
 //
 ///////////////////////////////////////////////////////////////////////

+#include <allheaders.h>
 #include <tesseract/pageiterator.h>
-#include "allheaders.h"
-#include <tesseract/helpers.h>
+#include "helpers.h"
 #include "pageres.h"
 #include "tesseractclass.h"

@ -27,23 +27,22 @@

 namespace tesseract {

-PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
-                           int scaled_yres, int rect_left, int rect_top,
-                           int rect_width, int rect_height)
-    : page_res_(page_res),
-      tesseract_(tesseract),
-      word_(nullptr),
-      word_length_(0),
-      blob_index_(0),
-      cblob_it_(nullptr),
-      include_upper_dots_(false),
-      include_lower_dots_(false),
-      scale_(scale),
-      scaled_yres_(scaled_yres),
-      rect_left_(rect_left),
-      rect_top_(rect_top),
-      rect_width_(rect_width),
-      rect_height_(rect_height) {
+PageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
+                           int rect_left, int rect_top, int rect_width, int rect_height)
+    : page_res_(page_res)
+    , tesseract_(tesseract)
+    , word_(nullptr)
+    , word_length_(0)
+    , blob_index_(0)
+    , cblob_it_(nullptr)
+    , include_upper_dots_(false)
+    , include_lower_dots_(false)
+    , scale_(scale)
+    , scaled_yres_(scaled_yres)
+    , rect_left_(rect_left)
+    , rect_top_(rect_top)
+    , rect_width_(rect_width)
+    , rect_height_(rect_height) {
  it_ = new PAGE_RES_IT(page_res);
  PageIterator::Begin();
 }
@ -58,26 +57,26 @@ PageIterator::~PageIterator() {
 * all the objects at a lower level, while maintaining an iterator to
 * objects at a higher level.
 */
-PageIterator::PageIterator(const PageIterator& src)
-    : page_res_(src.page_res_),
-      tesseract_(src.tesseract_),
-      word_(nullptr),
-      word_length_(src.word_length_),
-      blob_index_(src.blob_index_),
-      cblob_it_(nullptr),
-      include_upper_dots_(src.include_upper_dots_),
-      include_lower_dots_(src.include_lower_dots_),
-      scale_(src.scale_),
-      scaled_yres_(src.scaled_yres_),
-      rect_left_(src.rect_left_),
-      rect_top_(src.rect_top_),
-      rect_width_(src.rect_width_),
-      rect_height_(src.rect_height_) {
+PageIterator::PageIterator(const PageIterator &src)
+    : page_res_(src.page_res_)
+    , tesseract_(src.tesseract_)
+    , word_(nullptr)
+    , word_length_(src.word_length_)
+    , blob_index_(src.blob_index_)
+    , cblob_it_(nullptr)
+    , include_upper_dots_(src.include_upper_dots_)
+    , include_lower_dots_(src.include_lower_dots_)
+    , scale_(src.scale_)
+    , scaled_yres_(src.scaled_yres_)
+    , rect_left_(src.rect_left_)
+    , rect_top_(src.rect_top_)
+    , rect_width_(src.rect_width_)
+    , rect_height_(src.rect_height_) {
  it_ = new PAGE_RES_IT(*src.it_);
  BeginWord(src.blob_index_);
 }

-const PageIterator& PageIterator::operator=(const PageIterator& src) {
+const PageIterator &PageIterator::operator=(const PageIterator &src) {
  page_res_ = src.page_res_;
  tesseract_ = src.tesseract_;
  include_upper_dots_ = src.include_upper_dots_;
@ -94,9 +93,9 @@ const PageIterator& PageIterator::operator=(const PageIterator& src) {
  return *this;
 }

-bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
+bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {
  return (it_ == nullptr && it_ == other) ||
-     ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
+         ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
 }

 // ============= Moving around within the page ============.
@ -108,7 +107,8 @@ void PageIterator::Begin() {
 }

 void PageIterator::RestartParagraph() {
-  if (it_->block() == nullptr) return; // At end of the document.
+  if (it_->block() == nullptr)
+    return; // At end of the document.
  PAGE_RES_IT para(page_res_);
  PAGE_RES_IT next_para(para);
  next_para.forward_paragraph();
@ -145,7 +145,8 @@ void PageIterator::RestartRow() {
 * the appropriate language has been loaded into Tesseract.
 */
 bool PageIterator::Next(PageIteratorLevel level) {
-  if (it_->block() == nullptr) return false;  // Already at the end!
+  if (it_->block() == nullptr)
+    return false; // Already at the end!
  if (it_->word() == nullptr)
    level = RIL_BLOCK;

@ -157,8 +158,8 @@ bool PageIterator::Next(PageIteratorLevel level) {
      it_->forward_paragraph();
      break;
    case RIL_TEXTLINE:
-      for (it_->forward_with_empties(); it_->row() == it_->prev_row();
-           it_->forward_with_empties());
+      for (it_->forward_with_empties(); it_->row() == it_->prev_row(); it_->forward_with_empties())
+        ;
      break;
    case RIL_WORD:
      it_->forward_with_empties();
@ -183,15 +184,16 @@ bool PageIterator::Next(PageIteratorLevel level) {
 * moved to the start of a RIL_PARA.
 */
 bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
-  if (it_->block() == nullptr) return false;  // Already at the end!
-  if (it_->word() == nullptr) return true;  // In an image block.
+  if (it_->block() == nullptr)
+    return false; // Already at the end!
+  if (it_->word() == nullptr)
+    return true; // In an image block.
  switch (level) {
    case RIL_BLOCK:
      return blob_index_ == 0 && it_->block() != it_->prev_block();
    case RIL_PARA:
-      return blob_index_ == 0 &&
-          (it_->block() != it_->prev_block() ||
-           it_->row()->row->para() != it_->prev_row()->row->para());
+      return blob_index_ == 0 && (it_->block() != it_->prev_block() ||
+                                  it_->row()->row->para() != it_->prev_row()->row->para());
    case RIL_TEXTLINE:
      return blob_index_ == 0 && it_->row() != it_->prev_row();
    case RIL_WORD:
@ -206,9 +208,9 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
 * Returns whether the iterator is positioned at the last element in a
 * given level. (e.g. the last word in a line, the last line in a block)
 */
-bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
-                                    PageIteratorLevel element) const {
-  if (Empty(element)) return true;  // Already at the end!
+bool PageIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
+  if (Empty(element))
+    return true; // Already at the end!
  // The result is true if we step forward by element and find we are
  // at the end of the page or at the beginning of *all* levels in:
  // [level, element).
@ -217,7 +219,8 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
  // word on a line, so we also have to be at the first symbol in a word.
  PageIterator next(*this);
  next.Next(element);
-  if (next.Empty(element)) return true;  // Reached the end of the page.
+  if (next.Empty(element))
+    return true; // Reached the end of the page.
  while (element > level) {
    element = static_cast<PageIteratorLevel>(element - 1);
    if (!next.IsAtBeginningOf(element))
@ -262,28 +265,24 @@ int PageIterator::Cmp(const PageIterator &other) const {
 * See comment on coordinate system above.
 * Returns false if there is no such object at the current position.
 */
-bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
-                                       int* left, int* top,
-                                       int* right, int* bottom) const {
+bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right,
+                                       int *bottom) const {
  if (Empty(level))
    return false;
  TBOX box;
  PARA *para = nullptr;
  switch (level) {
    case RIL_BLOCK:
-      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
-                                                         include_lower_dots_);
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
      break;
    case RIL_PARA:
      para = it_->row()->row->para();
      // Fall through.
    case RIL_TEXTLINE:
-      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
-                                                     include_lower_dots_);
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
      break;
    case RIL_WORD:
-      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
-                                                       include_lower_dots_);
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
      break;
    case RIL_SYMBOL:
      if (cblob_it_ == nullptr)
@ -295,10 +294,8 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
    PageIterator other = *this;
    other.Begin();
    do {
-      if (other.it_->block() &&
-          other.it_->block()->block == it_->block()->block &&
-          other.it_->row() && other.it_->row()->row &&
-          other.it_->row()->row->para() == para) {
+      if (other.it_->block() && other.it_->block()->block == it_->block()->block &&
+          other.it_->row() && other.it_->row()->row && other.it_->row()->row->para() == para) {
        box = box.bounding_union(other.it_->row()->row->bounding_box());
      }
    } while (other.Next(RIL_TEXTLINE));
@ -322,65 +319,64 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
 * See comment on coordinate system above.
 * Returns false if there is no such object at the current position.
 */
-bool PageIterator::BoundingBox(PageIteratorLevel level,
-                               int* left, int* top,
-                               int* right, int* bottom) const {
+bool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
+                               int *bottom) const {
  return BoundingBox(level, 0, left, top, right, bottom);
 }

-bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
-                               int* left, int* top,
-                               int* right, int* bottom) const {
+bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding, int *left, int *top,
+                               int *right, int *bottom) const {
  if (!BoundingBoxInternal(level, left, top, right, bottom))
    return false;
  // Convert to the coordinate system of the original image.
-  *left = ClipToRange(*left / scale_ + rect_left_ - padding,
-                      rect_left_, rect_left_ + rect_width_);
-  *top = ClipToRange(*top / scale_ + rect_top_ - padding,
-                     rect_top_, rect_top_ + rect_height_);
-  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
-                       *left, rect_left_ + rect_width_);
-  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
-                        *top, rect_top_ + rect_height_);
+  *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_, rect_left_ + rect_width_);
+  *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_, rect_top_ + rect_height_);
+  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding, *left,
+                       rect_left_ + rect_width_);
+  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding, *top,
+                        rect_top_ + rect_height_);
  return true;
 }

 /** Return that there is no such object at a given level. */
 bool PageIterator::Empty(PageIteratorLevel level) const {
-  if (it_->block() == nullptr) return true;  // Already at the end!
-  if (it_->word() == nullptr && level != RIL_BLOCK) return true;  // image block
+  if (it_->block() == nullptr)
+    return true; // Already at the end!
+  if (it_->word() == nullptr && level != RIL_BLOCK)
+    return true; // image block
  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
-    return true;  // Zero length word, or already at the end of it.
+    return true; // Zero length word, or already at the end of it.
  return false;
 }

-/** Returns the type of the current block. See tesseract/apitypes.h for PolyBlockType. */
+/** Returns the type of the current block.
+ *  See tesseract/publictypes.h for PolyBlockType. */
 PolyBlockType PageIterator::BlockType() const {
  if (it_->block() == nullptr || it_->block()->block == nullptr)
-    return PT_UNKNOWN;  // Already at the end!
+    return PT_UNKNOWN; // Already at the end!
  if (it_->block()->block->pdblk.poly_block() == nullptr)
-    return PT_FLOWING_TEXT;  // No layout analysis used - assume text.
+    return PT_FLOWING_TEXT; // No layout analysis used - assume text.
  return it_->block()->block->pdblk.poly_block()->isA();
 }

 /** Returns the polygon outline of the current block. The returned Pta must
 *  be ptaDestroy-ed after use. */
-Pta* PageIterator::BlockPolygon() const {
+Pta *PageIterator::BlockPolygon() const {
  if (it_->block() == nullptr || it_->block()->block == nullptr)
-    return nullptr;  // Already at the end!
+    return nullptr; // Already at the end!
  if (it_->block()->block->pdblk.poly_block() == nullptr)
-    return nullptr;  // No layout analysis used - no polygon.
+    return nullptr; // No layout analysis used - no polygon.
  // Copy polygon, so we can unrotate it to image coordinates.
-  POLY_BLOCK* internal_poly = it_->block()->block->pdblk.poly_block();
+  POLY_BLOCK *internal_poly = it_->block()->block->pdblk.poly_block();
  ICOORDELT_LIST vertices;
  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
  POLY_BLOCK poly(&vertices, internal_poly->isA());
  poly.rotate(it_->block()->block->re_rotation());
  ICOORDELT_IT it(poly.points());
-  Pta* pta = ptaCreate(it.length());
+  Pta *pta = ptaCreate(it.length());
  int num_pts = 0;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++num_pts) {
-    ICOORD* pt = it.data();
+    ICOORD *pt = it.data();
    // Convert to top-down coords within the input image.
    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
@ -413,20 +409,19 @@ Pta* PageIterator::BlockPolygon() const {
 * should be good, even with xor, since the images come from the connected
 * components.
 */
-Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
+Pix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {
  int left, top, right, bottom;
  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
    return nullptr;
-  if (level == RIL_SYMBOL && cblob_it_ != nullptr &&
-      cblob_it_->data()->area() != 0)
+  if (level == RIL_SYMBOL && cblob_it_ != nullptr && cblob_it_->data()->area() != 0)
    return cblob_it_->data()->render();
-  Box* box = boxCreate(left, top, right - left, bottom - top);
-  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
+  Box *box = boxCreate(left, top, right - left, bottom - top);
+  Pix *pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
  boxDestroy(&box);
  if (level == RIL_BLOCK || level == RIL_PARA) {
    // Clip to the block polygon as well.
    TBOX mask_box;
-    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    Pix *mask = it_->block()->block->render_mask(&mask_box);
    int mask_x = left - mask_box.left();
    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
    // AND the mask and pix, putting the result in pix.
@ -449,9 +444,8 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
 * If you do not supply an original image, you will get a binary one.
 * Use pixDestroy to delete the image after use.
 */
-Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
-                            Pix* original_img,
-                            int* left, int* top) const {
+Pix *PageIterator::GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left,
+                            int *top) const {
  int right, bottom;
  if (!BoundingBox(level, left, top, &right, &bottom))
    return nullptr;
@ -463,24 +457,23 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
  *top = std::max(*top - padding, 0);
  right = std::min(right + padding, rect_width_);
  bottom = std::min(bottom + padding, rect_height_);
-  Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
-  Pix* grey_pix = pixClipRectangle(original_img, box, nullptr);
+  Box *box = boxCreate(*left, *top, right - *left, bottom - *top);
+  Pix *grey_pix = pixClipRectangle(original_img, box, nullptr);
  boxDestroy(&box);
  if (level == RIL_BLOCK || level == RIL_PARA) {
    // Clip to the block polygon as well.
    TBOX mask_box;
-    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    Pix *mask = it_->block()->block->render_mask(&mask_box);
    // Copy the mask registered correctly into an image the size of grey_pix.
    int mask_x = *left - mask_box.left();
    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
    int width = pixGetWidth(grey_pix);
    int height = pixGetHeight(grey_pix);
-    Pix* resized_mask = pixCreate(width, height, 1);
-    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height,
-                PIX_SRC, mask, std::max(0, mask_x), std::max(0, mask_y));
+    Pix *resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height, PIX_SRC,
+                mask, std::max(0, mask_x), std::max(0, mask_y));
    pixDestroy(&mask);
-    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
-                   2 * padding + 1);
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, 2 * padding + 1);
    pixInvert(resized_mask, resized_mask);
    pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
    pixDestroy(&resized_mask);
@ -493,14 +486,13 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
 * The baseline is the line that passes through (x1, y1) and (x2, y2).
 * WARNING: with vertical text, baselines may be vertical!
 */
-bool PageIterator::Baseline(PageIteratorLevel level,
-                            int* x1, int* y1, int* x2, int* y2) const {
-  if (it_->word() == nullptr) return false;  // Already at the end!
-  ROW* row = it_->row()->row;
-  WERD* word = it_->word()->word;
-  TBOX box = (level == RIL_WORD || level == RIL_SYMBOL)
-           ? word->bounding_box()
-           : row->bounding_box();
+bool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const {
+  if (it_->word() == nullptr)
+    return false; // Already at the end!
+  ROW *row = it_->row()->row;
+  WERD *word = it_->word()->word;
+  TBOX box =
+      (level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box() : row->bounding_box();
  int left = box.left();
  ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
  int right = box.right();
@ -519,7 +511,7 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
                               tesseract::WritingDirection *writing_direction,
                               tesseract::TextlineOrder *textline_order,
                               float *deskew_angle) const {
-  BLOCK* block = it_->block()->block;
+  BLOCK *block = it_->block()->block;

  // Orientation
  FCOORD up_in_image(0.0, 1.0);
@ -541,30 +533,23 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
  // Writing direction
  bool is_vertical_text = (block->classify_rotation().x() == 0.0);
  bool right_to_left = block->right_to_left();
-  *writing_direction =
-      is_vertical_text
-          ? WRITING_DIRECTION_TOP_TO_BOTTOM
-          : (right_to_left
-                ? WRITING_DIRECTION_RIGHT_TO_LEFT
-                : WRITING_DIRECTION_LEFT_TO_RIGHT);
+  *writing_direction = is_vertical_text ? WRITING_DIRECTION_TOP_TO_BOTTOM
+                                        : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT
+                                                         : WRITING_DIRECTION_LEFT_TO_RIGHT);

  // Textline Order
-  const bool is_mongolian = false;  // TODO(eger): fix me
-  *textline_order = is_vertical_text
-      ? (is_mongolian
-         ? TEXTLINE_ORDER_LEFT_TO_RIGHT
-         : TEXTLINE_ORDER_RIGHT_TO_LEFT)
-      : TEXTLINE_ORDER_TOP_TO_BOTTOM;
+  const bool is_mongolian = false; // TODO(eger): fix me
+  *textline_order = is_vertical_text ? (is_mongolian ? TEXTLINE_ORDER_LEFT_TO_RIGHT
+                                                     : TEXTLINE_ORDER_RIGHT_TO_LEFT)
+                                     : TEXTLINE_ORDER_TOP_TO_BOTTOM;

  // Deskew angle
-  FCOORD skew = block->skew();  // true horizontal for textlines
+  FCOORD skew = block->skew(); // true horizontal for textlines
  *deskew_angle = -skew.angle();
 }

-void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
-                                 bool *is_list_item,
-                                 bool *is_crown,
-                                 int *first_line_indent) const {
+void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just, bool *is_list_item,
+                                 bool *is_crown, int *first_line_indent) const {
  *just = tesseract::JUSTIFICATION_UNKNOWN;
  if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||
      !it_->row()->row->para()->model)
@ -573,8 +558,7 @@ void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
  PARA *para = it_->row()->row->para();
  *is_list_item = para->is_list_item;
  *is_crown = para->is_very_first_or_continuation;
-  *first_line_indent = para->model->first_indent() -
-      para->model->body_indent();
+  *first_line_indent = para->model->first_indent() - para->model->body_indent();
  *just = para->model->justification();
 }

@ -583,7 +567,7 @@ void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
 * moves the iterator to the given offset.
 */
 void PageIterator::BeginWord(int offset) {
-  WERD_RES* word_res = it_->word();
+  WERD_RES *word_res = it_->word();
  if (word_res == nullptr) {
    // This is a non-text block, so there is no word.
    word_length_ = 0;
@ -597,9 +581,8 @@ void PageIterator::BeginWord(int offset) {
    word_length_ = word_res->best_choice->length();
    if (word_res->box_word != nullptr) {
      if (word_res->box_word->length() != word_length_) {
-        tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
-                word_length_, word_res->best_choice->unichar_string().c_str(),
-                word_res->box_word->length());
+        tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
+                word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
        word_res->box_word->bounding_box().print();
      }
      ASSERT_HOST(word_res->box_word->length() == word_length_);
@ -613,7 +596,8 @@ void PageIterator::BeginWord(int offset) {
    word_ = word_res->word;
    ASSERT_HOST(word_->cblob_list() != nullptr);
    word_length_ = word_->cblob_list()->length();
-    if (cblob_it_ == nullptr) cblob_it_ = new C_BLOB_IT;
+    if (cblob_it_ == nullptr)
+      cblob_it_ = new C_BLOB_IT;
    cblob_it_->set_to_list(word_->cblob_list());
  }
  for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
@ -631,4 +615,4 @@ bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
  }
 }

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/pagesegmain.cpp
+++ b/src/ccmain/pagesegmain.cpp
@ -17,28 +17,30 @@
 **********************************************************************/

 #ifdef _WIN32
-#ifndef unlink
-#include <io.h>
-#endif
+#  ifndef unlink
+#    include <io.h>
+#  endif
 #else
-#include <unistd.h>
-#endif  // _WIN32
+#  include <unistd.h>
+#endif // _WIN32

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
-#include "config_auto.h"
+#  include "config_auto.h"
 #endif

-#include "allheaders.h"
+#include <allheaders.h>
 #include "blobbox.h"
 #include "blread.h"
 #include "colfind.h"
 #include "debugpixa.h"
-#include "equationdetect.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "equationdetect.h"
+#endif
+#include <tesseract/osdetect.h>
 #include "imagefind.h"
 #include "linefind.h"
 #include "makerow.h"
-#include <tesseract/osdetect.h>
 #include "tabvector.h"
 #include "tesseractclass.h"
 #include "tessvars.h"
@ -56,19 +58,19 @@ const int kMaxCircleErosions = 8;
 // The returned pix must be pixDestroyed after use. nullptr may be returned
 // if the image doesn't meet the trivial conditions that it uses to determine
 // success.
-static Pix* RemoveEnclosingCircle(Pix* pixs) {
-  Pix* pixsi = pixInvert(nullptr, pixs);
-  Pix* pixc = pixCreateTemplate(pixs);
+static Pix *RemoveEnclosingCircle(Pix *pixs) {
+  Pix *pixsi = pixInvert(nullptr, pixs);
+  Pix *pixc = pixCreateTemplate(pixs);
  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
  pixSeedfillBinary(pixc, pixc, pixsi, 4);
  pixInvert(pixc, pixc);
  pixDestroy(&pixsi);
-  Pix* pixt = pixAnd(nullptr, pixs, pixc);
+  Pix *pixt = pixAnd(nullptr, pixs, pixc);
  l_int32 max_count;
  pixCountConnComp(pixt, 8, &max_count);
  // The count has to go up before we start looking for the minimum.
  l_int32 min_count = INT32_MAX;
-  Pix* pixout = nullptr;
+  Pix *pixout = nullptr;
  for (int i = 1; i < kMaxCircleErosions; i++) {
    pixDestroy(&pixt);
    pixErodeBrick(pixc, pixc, 3, 3);
@ -78,12 +80,12 @@ static Pix* RemoveEnclosingCircle(Pix* pixs) {
    if (i == 1 || count > max_count) {
      max_count = count;
      min_count = count;
-    } else if (i > 1 && count < min_count) {
+    } else if (count < min_count) {
      min_count = count;
      pixDestroy(&pixout);
-      pixout = pixCopy(nullptr, pixt);  // Save the best.
+      pixout = pixCopy(nullptr, pixt); // Save the best.
    } else if (count >= min_count) {
-      break;  // We have passed by the best.
+      break; // We have passed by the best.
    }
  }
  pixDestroy(&pixt);
@ -96,19 +98,17 @@ static Pix* RemoveEnclosingCircle(Pix* pixs) {
 * pix_binary_ is used as the source image and should not be nullptr.
 * On return the blocks list owns all the constructed page layout.
 */
-int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
-                           Tesseract* osd_tess, OSResults* osr) {
+int Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,
+                           OSResults *osr) {
  ASSERT_HOST(pix_binary_ != nullptr);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
-  auto pageseg_mode = static_cast<PageSegMode>(
-      static_cast<int>(tessedit_pageseg_mode));
+  auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
-  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
-      input_file != nullptr && input_file->length() > 0) {
-    STRING name = *input_file;
-    const char* lastdot = strrchr(name.c_str(), '.');
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
+    std::string name = input_file;
+    const char *lastdot = strrchr(name.c_str(), '.');
    if (lastdot != nullptr)
      name[lastdot - name.c_str()] = '\0';
    read_unlv_file(name, width, height, blocks);
@ -117,7 +117,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
-    auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+    auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
@ -135,9 +135,9 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
-    auto_page_seg_ret_val = AutoPageSeg(
-        pageseg_mode, blocks, &to_blocks,
-        enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
+    auto_page_seg_ret_val =
+        AutoPageSeg(pageseg_mode, blocks, &to_blocks,
+                    enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
@ -146,7 +146,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
-      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      Pix *pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != nullptr) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
@ -161,15 +161,13 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
-    return 0;  // AutoPageSeg found an empty page.
+    return 0; // AutoPageSeg found an empty page.
  }
-  bool splitting =
-      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
+  bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;

-  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
-                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
-                       &diacritic_blobs, blocks, &to_blocks);
+  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
+                       pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
 }

@ -197,42 +195,42 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
 * another Tesseract that was initialized especially for osd, and the results
 * will be output into osr (orientation and script result).
 */
-int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
-                           TO_BLOCK_LIST* to_blocks,
-                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
-                           OSResults* osr) {
-  Pix* photomask_pix = nullptr;
-  Pix* musicmask_pix = nullptr;
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
+                           BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {
+  Pix *photomask_pix = nullptr;
+  Pix *musicmask_pix = nullptr;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST found_blocks;
  TO_BLOCK_LIST temp_blocks;

-  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
+  ColumnFinder *finder = SetupPageSegAndDetectOrientation(
      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
      pageseg_apply_music_mask ? &musicmask_pix : nullptr);
  int result = 0;
  if (finder != nullptr) {
    TO_BLOCK_IT to_block_it(&temp_blocks);
-    TO_BLOCK* to_block = to_block_it.data();
+    TO_BLOCK *to_block = to_block_it.data();
    if (musicmask_pix != nullptr) {
      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
      // blocks separately. For now combine with photomask_pix.
      pixOr(photomask_pix, photomask_pix, musicmask_pix);
    }
+#ifndef DISABLED_LEGACY_ENGINE
    if (equ_detect_) {
      finder->SetEquationDetect(equ_detect_);
    }
-    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
-                                to_block, photomask_pix, pix_thresholds_,
-                                pix_grey_, &pixa_debug_, &found_blocks,
-                                diacritic_blobs, to_blocks);
+#endif // ndef DISABLED_LEGACY_ENGINE
+    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
+                                photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
+                                &found_blocks, diacritic_blobs, to_blocks);
    if (result >= 0)
      finder->GetDeskewVectors(&deskew_, &reskew_);
    delete finder;
  }
  pixDestroy(&photomask_pix);
  pixDestroy(&musicmask_pix);
-  if (result < 0) return result;
+  if (result < 0)
+    return result;

  blocks->clear();
  BLOCK_IT block_it(blocks);
@ -243,12 +241,11 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,

 // Helper adds all the scripts from sid_set converted to ids from osd_set to
 // allowed_ids.
-static void AddAllScriptsConverted(const UNICHARSET& sid_set,
-                                   const UNICHARSET& osd_set,
-                                   GenericVector<int>* allowed_ids) {
+static void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,
+                                   std::vector<int> *allowed_ids) {
  for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
    if (i != sid_set.null_sid()) {
-      const char* script = sid_set.get_script_from_script_id(i);
+      const char *script = sid_set.get_script_from_script_id(i);
      allowed_ids->push_back(osd_set.get_script_id_from_name(script));
    }
  }
@ -258,19 +255,20 @@ static void AddAllScriptsConverted(const UNICHARSET& sid_set,
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
- * photo_mask_pix is a pointer to a nullptr pointer that will be filled on return
- * with the leptonica photo mask, which must be pixDestroyed by the caller.
- * to_blocks is an empty list that will be filled with (usually a single)
- * block that is used during layout analysis. This ugly API is required
+ * photo_mask_pix is a pointer to a nullptr pointer that will be filled on
+ * return with the leptonica photo mask, which must be pixDestroyed by the
+ * caller. to_blocks is an empty list that will be filled with (usually a
+ * single) block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 */
-ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
-    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
-    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
-    Pix** music_mask_pix) {
+ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,
+                                                          BLOCK_LIST *blocks, Tesseract *osd_tess,
+                                                          OSResults *osr, TO_BLOCK_LIST *to_blocks,
+                                                          Pix **photo_mask_pix,
+                                                          Pix **music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
@ -282,19 +280,25 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
  }
  // Leptonica is used to find the rule/separator lines in the input.
-  LineFinder::FindAndRemoveLines(source_resolution_,
-                                 textord_tabfind_show_vlines, pix_binary_,
-                                 &vertical_x, &vertical_y, music_mask_pix,
-                                 &v_lines, &h_lines);
+  LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
+                                 &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoLines");
  }
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
  if (tessedit_dump_pageseg_images) {
-    pixa_debug_.AddPix(pix_binary_, "NoImages");
+    Pix *pix_no_image_ = nullptr;
+    if (*photo_mask_pix != nullptr) {
+      pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
+    } else {
+      pix_no_image_ = pixClone(pix_binary_);
+    }
+    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    pixDestroy(&pix_no_image_);
  }
-  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode))
+    v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);
@ -303,9 +307,9 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
-  TO_BLOCK* to_block = to_block_it.data();
+  TO_BLOCK *to_block = to_block_it.data();
  TBOX blkbox = to_block->block->pdblk.bounding_box();
-  ColumnFinder* finder = nullptr;
+  ColumnFinder *finder = nullptr;
  int estimated_resolution = source_resolution_;
  if (source_resolution_ == kMinCredibleResolution) {
    // Try to estimate resolution from typical body text size.
@ -317,11 +321,10 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
  }

  if (to_block->line_size >= 2) {
-    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
-                              blkbox.botleft(), blkbox.topright(),
-                              estimated_resolution, textord_use_cjk_fp_model,
-                              textord_tabfind_aligned_gap_fraction, &v_lines,
-                              &h_lines, vertical_x, vertical_y);
+    finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
+                              blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
+                              textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
+                              vertical_y);

    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);

@ -337,23 +340,20 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
-    bool vertical_text = textord_tabfind_force_vertical_text ||
-                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
-    if (!vertical_text && textord_tabfind_vertical_text &&
-        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
-      vertical_text =
-          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
-                                          to_block, &osd_blobs);
+    bool vertical_text =
+        textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
+    if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
+      vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
+                                                      &osd_blobs);
    }
    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
-      GenericVector<int> osd_scripts;
+      std::vector<int> osd_scripts;
      if (osd_tess != this) {
        // We are running osd as part of layout analysis, so constrain the
        // scripts to those allowed by *this.
        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
        for (int s = 0; s < sub_langs_.size(); ++s) {
-          AddAllScriptsConverted(sub_langs_[s]->unicharset,
-                                 osd_tess->unicharset, &osd_scripts);
+          AddAllScriptsConverted(sub_langs_[s]->unicharset, osd_tess->unicharset, &osd_scripts);
        }
      }
      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
@@ -365,20 +365,17 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
      double osd_score = osr->orientations[osd_orientation];
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
-        if (i != osd_orientation &&
-            osd_score - osr->orientations[i] < osd_margin) {
+        if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
-      const char* best_script_str =
-          osd_tess->unicharset.get_script_from_script_id(best_script_id);
+      const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
-          best_script_id == osd_tess->unicharset.hiragana_sid() ||
-          best_script_id == osd_tess->unicharset.katakana_sid() ||
-          strcmp("Japanese", best_script_str) == 0 ||
-          strcmp("Korean", best_script_str) == 0 ||
-          strcmp("Hangul", best_script_str) == 0;
+                 best_script_id == osd_tess->unicharset.hiragana_sid() ||
+                 best_script_id == osd_tess->unicharset.katakana_sid() ||
+                 strcmp("Japanese", best_script_str) == 0 ||
+                 strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
@@ -386,8 +383,10 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
-          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
-                  "Don't rotate.\n", osd_margin);
+          tprintf(
+              "OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+              "Don't rotate.\n",
+              osd_margin);
          osd_orientation = 0;
        } else {
          tprintf(
@@ -400,10 +399,10 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);

-#endif  // ndef DISABLED_LEGACY_ENGINE
+#endif // ndef DISABLED_LEGACY_ENGINE
  }

  return finder;
 }

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/pagewalk.cpp
+++ b/src/ccmain/pagewalk.cpp
@@ -28,16 +28,14 @@ namespace tesseract {
 * to each word that overlaps the selection_box.
 */
 void Tesseract::process_selected_words(
-        PAGE_RES* page_res, // blocks to check
-        TBOX& selection_box,
-        bool (tesseract::Tesseract::* word_processor)(PAGE_RES_IT* pr_it)) {
-  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr;
-       page_res_it.forward()) {
-    WERD* word = page_res_it.word()->word;
+    PAGE_RES *page_res, // blocks to check
+    TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) {
+  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) {
+    WERD *word = page_res_it.word()->word;
    if (word->bounding_box().overlap(selection_box)) {
      if (!(this->*word_processor)(&page_res_it))
        return;
    }
  }
 }
-}  // namespace tesseract
+} // namespace tesseract
--- a/src/ccmain/par_control.cpp
+++ b/src/ccmain/par_control.cpp
@@ -18,32 +18,31 @@

 #include "tesseractclass.h"
 #ifdef _OPENMP
-#include <omp.h>
-#endif  // _OPENMP
+#  include <omp.h>
+#endif // _OPENMP

 namespace tesseract {

 struct BlobData {
  BlobData() = default;
-  BlobData(int index, Tesseract* tess, const WERD_RES& word)
-    : blob(word.chopped_word->blobs[index]),
-      tesseract(tess),
-      choices(&(*word.ratings)(index, index)) {}
+  BlobData(int index, Tesseract *tess, const WERD_RES &word)
+      : blob(word.chopped_word->blobs[index])
+      , tesseract(tess)
+      , choices(&(*word.ratings)(index, index)) {}

-  TBLOB* blob = nullptr;
-  Tesseract* tesseract = nullptr;
-  BLOB_CHOICE_LIST** choices = nullptr;
+  TBLOB *blob = nullptr;
+  Tesseract *tesseract = nullptr;
+  BLOB_CHOICE_LIST **choices = nullptr;
 };

-void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
+void Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {
  // Prepare all the blobs.
-  GenericVector<BlobData> blobs;
-  for (int w = 0; w < words.size(); ++w) {
-    if (words[w].word->ratings != nullptr &&
-        words[w].word->ratings->get(0, 0) == nullptr) {
+  std::vector<BlobData> blobs;
+  for (size_t w = 0; w < words.size(); ++w) {
+    if (words[w].word->ratings != nullptr && words[w].word->ratings->get(0, 0) == nullptr) {
      for (int s = 0; s < words[w].lang_words.size(); ++s) {
-        Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
-        const WERD_RES& word = *words[w].lang_words[s];
+        Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
+        const WERD_RES &word = *words[w].lang_words[s];
        for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
          blobs.push_back(BlobData(b, sub, word));
        }
@@ -53,19 +52,19 @@ void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
  // Pre-classify all the blobs.
  if (tessedit_parallelize > 1) {
 #ifdef _OPENMP
-#pragma omp parallel for num_threads(10)
-#endif  // _OPENMP
-    for (int b = 0; b < blobs.size(); ++b) {
+#  pragma omp parallel for num_threads(10)
+#endif // _OPENMP
+    for (size_t b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
-          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
    }
  } else {
    // TODO(AMD) parallelize this.
-    for (int b = 0; b < blobs.size(); ++b) {
+    for (size_t b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
-          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
    }
  }
 }

-}  // namespace tesseract.
+} // namespace tesseract.
--- a/src/ccmain/paragraphs.cpp
+++ b/src/ccmain/paragraphs.cpp
--- a/src/ccmain/paragraphs.h
+++ b/src/ccmain/paragraphs.h
@@ -20,44 +20,44 @@
 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
 #define TESSERACT_CCMAIN_PARAGRAPHS_H_

-#include "rect.h"    // for TBOX
-#include <tesseract/strngs.h>  // for STRING
-
-class PARA_LIST;
-class ParagraphModel;
-
-struct PARA;
-
-template <typename T> class GenericVector;
+#include <list>
+#include <string>
+#include "rect.h"   // for TBOX

 namespace tesseract {

 class MutableIterator;
+class ParagraphModel;
+class PARA_LIST;
+struct PARA;
+
+template <typename T>
+class GenericVector;

 // This structure captures all information needed about a text line for the
 // purposes of paragraph detection.  It is meant to be exceedingly light-weight
 // so that we can easily test paragraph detection independent of the rest of
 // Tesseract.
 class RowInfo {
- public:
+public:
  // Constant data derived from Tesseract output.
-  STRING text;        // the full UTF-8 text of the line.
-  bool ltr;           // whether the majority of the text is left-to-right
-                      // TODO(eger) make this more fine-grained.
+  std::string text; // the full UTF-8 text of the line.
+  bool ltr;    // whether the majority of the text is left-to-right
+               // TODO(eger) make this more fine-grained.

-  bool has_leaders;   // does the line contain leader dots (.....)?
-  bool has_drop_cap;  // does the line have a drop cap?
-  int pix_ldistance;  // distance to the left pblock boundary in pixels
-  int pix_rdistance;  // distance to the right pblock boundary in pixels
-  float pix_xheight;  // guessed xheight for the line
+  bool has_leaders;            // does the line contain leader dots (.....)?
+  bool has_drop_cap;           // does the line have a drop cap?
+  int pix_ldistance;           // distance to the left pblock boundary in pixels
+  int pix_rdistance;           // distance to the right pblock boundary in pixels
+  float pix_xheight;           // guessed xheight for the line
  int average_interword_space; // average space between words in pixels.

  int num_words;
-  TBOX lword_box;     // in normalized (horiz text rows) space
-  TBOX rword_box;     // in normalized (horiz text rows) space
+  TBOX lword_box; // in normalized (horiz text rows) space
+  TBOX rword_box; // in normalized (horiz text rows) space

-  STRING lword_text;   // the UTF-8 text of the leftmost werd
-  STRING rword_text;   // the UTF-8 text of the rightmost werd
+  std::string lword_text; // the UTF-8 text of the leftmost werd
+  std::string rword_text; // the UTF-8 text of the rightmost werd

  //   The text of a paragraph typically starts with the start of an idea and
  // ends with the end of an idea.  Here we define paragraph as something that
@@ -88,22 +88,20 @@ class RowInfo {
 //   paragraphs - this is the actual list of PARA objects.
 //   models - the list of paragraph models referenced by the PARA objects.
 //            caller is responsible for deleting the models.
-void DetectParagraphs(int debug_level,
-                      GenericVector<RowInfo> *row_infos,
-                      GenericVector<PARA *> *row_owners,
-                      PARA_LIST *paragraphs,
-                      GenericVector<ParagraphModel *> *models);
+TESS_API
+void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
+                      GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
+                      std::vector<ParagraphModel *> *models);

 // Given a MutableIterator to the start of a block, run DetectParagraphs on
 // that block and commit the results to the underlying ROW and BLOCK structs,
 // saving the ParagraphModels in models.  Caller owns the models.
 // We use unicharset during the function to answer questions such as "is the
 // first letter of this word upper case?"
-void DetectParagraphs(int debug_level,
-                      bool after_text_recognition,
-                      const MutableIterator *block_start,
-                      GenericVector<ParagraphModel *> *models);
+TESS_API
+void DetectParagraphs(int debug_level, bool after_text_recognition,
+                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models);

-}  // namespace
+} // namespace tesseract

-#endif  // TESSERACT_CCMAIN_PARAGRAPHS_H_
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
--- a/Show More
+++ b/Show More